|
| 1 | +"""Bench runner for ``supamem eval --regress``. |
| 2 | +
|
| 3 | +Loads a JSONL golden set (bundled or external), runs each query against |
| 4 | +:class:`supamem.retrieval.tuned_hybrid.TunedHybridBackend`, computes |
| 5 | +recall@5 via substring matching against each record's |
| 6 | +``required_substrings`` list, and aggregates to mean recall + p95 latency |
| 7 | ++ total tokens. ``--regress`` mode compares the aggregate to Phase 80.1 |
| 8 | +locked thresholds and exits non-zero on any breach (SC-9 regression gate). |
| 9 | +""" |
| 10 | +from __future__ import annotations |
| 11 | + |
| 12 | +import json |
| 13 | +import logging |
| 14 | +import time |
| 15 | +from importlib import resources |
| 16 | +from pathlib import Path |
| 17 | +from typing import Any |
| 18 | + |
| 19 | +from supamem.config import ResolvedConfig |
| 20 | +from supamem.retrieval.tuned_hybrid import TunedHybridBackend |
| 21 | +from supamem.retrieval.types import RetrievedChunk |
| 22 | + |
| 23 | +log = logging.getLogger("supamem.eval.runner") |
| 24 | + |
# Phase 80.1 locked thresholds (D-19).
# ``run_bench`` checks these only in regress mode: mean recall must not fall
# below its value, while total tokens and p95 latency must not rise above
# theirs.
BASELINE: dict[str, float] = {
    "mean_recall_at_5": 0.60,
    "total_tokens": 4000,
    "p95_latency_ms": 500,
}

# File name of the golden set read from the ``supamem.eval.goldens`` package
# when no external path is supplied.
BUNDLED_GOLDENS: str = "phase_80_1_tuned_hybrid.jsonl"
| 33 | + |
| 34 | + |
def _load_goldens(path: str | None) -> list[dict[str, Any]]:
    """Load golden records from a JSONL file.

    Args:
        path: Filesystem path of an external JSONL golden set. When falsy
            (``None`` or ``""``), the ``BUNDLED_GOLDENS`` resource of the
            ``supamem.eval.goldens`` package is read instead.

    Returns:
        One dict per non-blank line, in file order.

    Raises:
        OSError: The file or bundled resource cannot be read.
        json.JSONDecodeError: A line is not valid JSON. The message is
            prefixed with ``<source>:<line number>``; the position that
            follows is relative to that single line.
        ValueError: A line is valid JSON but not a JSON object.
    """
    if path:
        source = str(path)
        body = Path(path).read_text(encoding="utf-8")
    else:
        # The goldens dir is a sub-package; resources.files works because
        # ``supamem.eval.goldens`` has its own __init__.py.
        source = BUNDLED_GOLDENS
        files = resources.files("supamem.eval.goldens")
        target = files / BUNDLED_GOLDENS
        body = target.read_text(encoding="utf-8")

    records: list[dict[str, Any]] = []
    # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029,
    # NEL, form feed, etc., which may legally appear unescaped inside a JSON
    # string and would cut a valid record in half. A stray "\r" is harmless:
    # json.loads ignores surrounding whitespace.
    for lineno, line in enumerate(body.split("\n"), start=1):
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as exc:
            # Each line is parsed on its own, so the stock message always
            # says "line 1"; prepend the real location in the file.
            raise json.JSONDecodeError(
                f"{source}:{lineno}: {exc.msg}", exc.doc, exc.pos
            ) from exc
        if not isinstance(record, dict):
            raise ValueError(
                f"{source}:{lineno}: expected a JSON object, "
                f"got {type(record).__name__}"
            )
        records.append(record)
    return records
| 51 | + |
| 52 | + |
def _recall_at_5(retrieved: list[RetrievedChunk], required: list[str]) -> float:
    """Score one query by substring recall over the top five chunks.

    The texts of the first five retrieved chunks are joined with single
    spaces (a missing text counts as empty). The score is the share of
    ``required`` strings that occur in that joined text. An empty
    ``required`` list scores ``0.0``.
    """
    if not required:
        return 0.0
    top_texts = [chunk.text or "" for chunk in retrieved[:5]]
    haystack = " ".join(top_texts)
    found = [needle for needle in required if needle in haystack]
    return len(found) / len(required)
| 60 | + |
| 61 | + |
def _percentile(values: list[float], pct: float) -> float:
    """Return the ``pct``-th percentile of ``values`` by rounded rank.

    The values are sorted, ``pct`` is mapped onto the index range
    ``0 .. len - 1``, rounded with the built-in ``round`` and clamped to that
    range; the element at the resulting index is returned as a float. An
    empty input yields ``0.0``.
    """
    if not values:
        return 0.0
    ordered = sorted(values)
    last = len(ordered) - 1
    rank = round(pct / 100.0 * last)
    # Clamp so an out-of-range ``pct`` still selects a valid element.
    rank = min(max(rank, 0), last)
    return float(ordered[rank])
| 68 | + |
| 69 | + |
def _build_backend(config: ResolvedConfig) -> TunedHybridBackend:
    """Construct the ``TunedHybridBackend`` used by the bench from ``config``."""
    # NOTE(review): presumably kept as a separate function so the backend can
    # be swapped out in tests; confirm against the test suite.
    backend = TunedHybridBackend(config=config)
    return backend
| 72 | + |
| 73 | + |
def run_bench(
    *,
    regress: bool = False,
    goldens_path: str | None = None,
    config: ResolvedConfig | None = None,
) -> int:
    """Run the retrieval bench and, optionally, the regression gate.

    Every golden record with a non-blank ``query`` is sent to the backend
    with ``k=5``. For each one, recall@5 and wall-clock latency are recorded
    and a rough token estimate (``len(text) // 4``, at least 1 per chunk) is
    accumulated. A summary is printed to stdout.

    Args:
        regress: When true, compare the aggregates against ``BASELINE`` and
            fail on any breach.
        goldens_path: Path of an external JSONL golden set, passed to
            ``_load_goldens``; ``None`` is the default.
        config: Configuration handed to the backend. ``None`` builds a
            default ``ResolvedConfig()``.

    Returns:
        ``0`` on success. ``1`` when the goldens cannot be read or parsed,
        when no records were loaded, or (``regress`` only) when at least one
        threshold is breached.
    """
    # ``is None`` rather than ``or`` so a caller-supplied config is never
    # silently replaced just because it happens to be falsy.
    cfg = config if config is not None else ResolvedConfig()
    try:
        records = _load_goldens(goldens_path)
    except (OSError, ValueError) as exc:
        # OSError: missing/unreadable file. ValueError: malformed JSON
        # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
        # Both are fatal for the bench and must map to exit code 1.
        log.error("supamem eval: failed to load goldens: %s", exc)
        return 1
    if not records:
        log.warning("supamem eval: no golden records loaded")
        return 1

    backend = _build_backend(cfg)
    recalls: list[float] = []
    latencies: list[float] = []
    total_tokens = 0

    for rec in records:
        query = str(rec.get("query") or "").strip()
        if not query:
            # Records without a usable query are not scored at all.
            continue
        required = list(rec.get("required_substrings") or [])
        t0 = time.perf_counter()
        try:
            chunks = backend.query(query, k=5)
        except Exception as exc:  # noqa: BLE001
            # Best-effort: a failing query is scored as a miss instead of
            # aborting the whole bench.
            log.warning("supamem eval: query %r failed: %s", query, type(exc).__name__)
            chunks = []
        latencies.append((time.perf_counter() - t0) * 1000.0)
        recalls.append(_recall_at_5(chunks, required))
        total_tokens += sum(max(1, len(c.text or "") // 4) for c in chunks)

    mean_recall = sum(recalls) / len(recalls) if recalls else 0.0
    p95 = _percentile(latencies, 95.0)
    summary = {
        # Count the queries that were actually scored, so the figure matches
        # the denominator of the mean (blank-query records are excluded).
        "queries": len(recalls),
        "mean_recall_at_5": round(mean_recall, 4),
        "p95_latency_ms": round(p95, 2),
        "total_tokens": total_tokens,
    }

    print("supamem eval — bench summary")
    print(f" queries : {summary['queries']}")
    print(f" mean recall@5 : {summary['mean_recall_at_5']}")
    print(f" p95 latency (ms) : {summary['p95_latency_ms']}")
    print(f" total tokens : {summary['total_tokens']}")

    if not regress:
        return 0

    # Recall is a floor; tokens and latency are ceilings. The unrounded
    # aggregates are compared, not the rounded values shown in the summary.
    breaches: list[str] = []
    if mean_recall < BASELINE["mean_recall_at_5"]:
        breaches.append(
            f"mean_recall_at_5={mean_recall:.4f} < baseline {BASELINE['mean_recall_at_5']}"
        )
    if total_tokens > BASELINE["total_tokens"]:
        breaches.append(
            f"total_tokens={total_tokens} > baseline {BASELINE['total_tokens']}"
        )
    if p95 > BASELINE["p95_latency_ms"]:
        breaches.append(
            f"p95_latency_ms={p95:.2f} > baseline {BASELINE['p95_latency_ms']}"
        )

    if breaches:
        print()
        print("supamem eval — REGRESSION:")
        for breach in breaches:
            print(f" - {breach}")
        return 1

    print()
    print("supamem eval — regress: PASS")
    return 0
| 157 | + |
| 158 | + |
# Public API of this module: the locked thresholds and the bench entry point.
__all__ = ["BASELINE", "run_bench"]
0 commit comments