|
6 | 6 | against PersonalMemory.query() and reports hit@1 / hit@3 / false-positive |
7 | 7 | rate. See docs/specs/memory-recall-eval/requirements.md for the design. |
8 | 8 |
|
| 9 | +Modes (--phase): |
| 10 | + all capture + evaluate in one process (default; Runs 1-2) |
| 11 | + persistence capture and evaluate in two SEPARATE OS processes against |
| 12 | + the same global root, so the query side starts from a |
| 13 | + brand-new PersonalMemory instance with nothing in memory |
| 14 | + - proves recall survives process death (Run 3) |
| 15 | + capture write the corpus into --root (used by persistence mode) |
| 16 | + evaluate query against an already-captured --root (used by |
| 17 | + persistence mode; --json-out FILE for machine-readable |
| 18 | + output) |
| 19 | +
|
9 | 20 | Never touches real memory (~/.attune/personal_memory or a project's |
10 | | -.attune/memory/) - both roots are isolated temp directories. |
| 21 | +.attune/memory/) - the orchestrating modes use isolated temp |
| 22 | +directories; capture/evaluate take an explicit --root. |
11 | 23 | """ |
12 | 24 |
|
13 | 25 | from __future__ import annotations |
14 | 26 |
|
| 27 | +import argparse |
| 28 | +import json |
15 | 29 | import shutil |
| 30 | +import subprocess |
16 | 31 | import sys |
17 | 32 | import tempfile |
18 | 33 | from dataclasses import dataclass |
@@ -196,82 +211,131 @@ class Query: |
196 | 211 | ] |
197 | 212 |
|
198 | 213 |
|
199 | | -def run_benchmark() -> dict: |
200 | | - tmp_root = Path(tempfile.mkdtemp(prefix="attune_memory_recall_eval_")) |
201 | | - global_root = tmp_root / "global" |
202 | | - unused_project_root = tmp_root / "no_project_dir" # deliberately never created |
203 | | - global_root.mkdir(parents=True, exist_ok=True) |
def capture_corpus(global_root: Path) -> None:
    """Write the benchmark corpus into ``global_root`` via PersonalMemory.

    Args:
        global_root: Directory the corpus is captured into. It is created
            (including parents) when missing, so a direct
            ``--phase capture --root NEW_DIR`` invocation behaves the same
            as the orchestrating modes, which create the root up front.
    """
    # Harmless when the caller already created the directory.
    global_root.mkdir(parents=True, exist_ok=True)
    # Sibling path that is deliberately never created, passed as the
    # project root.
    unused_project_root = global_root.parent / "no_project_dir"
    pm = PersonalMemory(global_root=global_root, project_root=unused_project_root)
    for entry in CORPUS:
        pm.capture(entry.topic, entry.content, kind=entry.kind)
204 | 220 |
|
205 | | - try: |
206 | | - pm = PersonalMemory(global_root=global_root, project_root=unused_project_root) |
207 | | - |
208 | | - for entry in CORPUS: |
209 | | - pm.capture(entry.topic, entry.content, kind=entry.kind) |
210 | | - |
211 | | - hit_at_1 = 0 |
212 | | - hit_at_3 = 0 |
213 | | - positive_queries = [q for q in QUERIES if q.expected_topic is not None] |
214 | | - negative_queries = [q for q in QUERIES if q.expected_topic is None] |
215 | | - failures: list[dict] = [] |
216 | | - |
217 | | - positive_top_scores: list[float] = [] |
218 | | - for q in positive_queries: |
219 | | - results = pm.query(q.text, k=3) |
220 | | - topics_returned = [Path(r["path"]).parent.name for r in results] |
221 | | - if results: |
222 | | - positive_top_scores.append(results[0]["score"]) |
223 | | - if topics_returned[:1] == [q.expected_topic]: |
224 | | - hit_at_1 += 1 |
225 | | - if q.expected_topic in topics_returned: |
226 | | - hit_at_3 += 1 |
227 | | - else: |
228 | | - failures.append( |
229 | | - { |
230 | | - "query": q.text, |
231 | | - "expected": q.expected_topic, |
232 | | - "got": topics_returned, |
233 | | - } |
234 | | - ) |
235 | | - |
236 | | - # NOTE: `score` is an unbounded raw keyword-overlap count (not a |
237 | | - # normalized [0,1] confidence), so there is no universal absolute |
238 | | - # threshold for "confident false positive." We report the actual |
239 | | - # top-1 score distributions for positive vs. negative queries and |
240 | | - # let the reader judge separation, rather than picking an arbitrary |
241 | | - # cutoff that could over- or under-state precision. |
242 | | - negative_top_scores: list[float] = [] |
243 | | - negative_hits: list[dict] = [] |
244 | | - for q in negative_queries: |
245 | | - results = pm.query(q.text, k=3) |
246 | | - top_score = results[0]["score"] if results else 0.0 |
247 | | - negative_top_scores.append(top_score) |
248 | | - negative_hits.append( |
| 221 | + |
def evaluate(global_root: Path) -> dict:
    """Run the ground-truth queries against an already-captured root.

    Args:
        global_root: Directory a previous capture wrote the corpus into.

    Returns:
        A dict with the corpus and query counts, hit@1 / hit@3 counts and
        rates over the positive queries, the top-1 score lists for positive
        and negative queries, the positive queries whose expected topic was
        not in the top 3 (``failures``), and the top result of every
        negative query (``negative_hits``). Rates are 0.0 when there are no
        positive queries.
    """
    unused_project_root = global_root.parent / "no_project_dir"  # never created
    pm = PersonalMemory(global_root=global_root, project_root=unused_project_root)

    hit_at_1 = 0
    hit_at_3 = 0
    positive_queries = [q for q in QUERIES if q.expected_topic is not None]
    negative_queries = [q for q in QUERIES if q.expected_topic is None]
    failures: list[dict] = []

    positive_top_scores: list[float] = []
    for q in positive_queries:
        results = pm.query(q.text, k=3)
        # A result's topic is the name of the directory holding its file.
        topics_returned = [Path(r["path"]).parent.name for r in results]
        if results:
            positive_top_scores.append(results[0]["score"])
        if topics_returned[:1] == [q.expected_topic]:
            hit_at_1 += 1
        if q.expected_topic in topics_returned:
            hit_at_3 += 1
        else:
            failures.append(
                {
                    "query": q.text,
                    "expected": q.expected_topic,
                    "got": topics_returned,
                }
            )

    # NOTE: `score` is an unbounded raw keyword-overlap count (not a
    # normalized [0,1] confidence), so there is no universal absolute
    # threshold for "confident false positive." We report the actual
    # top-1 score distributions for positive vs. negative queries and
    # let the reader judge separation, rather than picking an arbitrary
    # cutoff that could over- or under-state precision.
    negative_top_scores: list[float] = []
    negative_hits: list[dict] = []
    for q in negative_queries:
        results = pm.query(q.text, k=3)
        top_score = results[0]["score"] if results else 0.0
        negative_top_scores.append(top_score)
        negative_hits.append(
            {
                "query": q.text,
                "top_result": results[0]["path"] if results else None,
                "score": top_score,
            }
        )

    # Guard the rates: with no positive queries report 0.0 instead of
    # raising ZeroDivisionError.
    positive_count = len(positive_queries)
    hit_at_1_rate = hit_at_1 / positive_count if positive_count else 0.0
    hit_at_3_rate = hit_at_3 / positive_count if positive_count else 0.0

    return {
        "corpus_size": len(CORPUS),
        "positive_queries": positive_count,
        "negative_queries": len(negative_queries),
        "hit_at_1": hit_at_1,
        "hit_at_1_rate": hit_at_1_rate,
        "hit_at_3": hit_at_3,
        "hit_at_3_rate": hit_at_3_rate,
        "positive_top_scores": positive_top_scores,
        "negative_top_scores": negative_top_scores,
        "failures": failures,
        "negative_hits": negative_hits,
    }
| 285 | + |
| 286 | + |
def run_benchmark() -> dict:
    """Capture the corpus and evaluate it inside this one process.

    Works in a throwaway temp directory that is removed afterwards,
    whether or not the run succeeds. Returns the dict produced by
    ``evaluate``.
    """
    scratch_dir = Path(tempfile.mkdtemp(prefix="attune_memory_recall_eval_"))
    corpus_root = scratch_dir / "global"
    corpus_root.mkdir(parents=True, exist_ok=True)
    try:
        capture_corpus(corpus_root)
        outcome = evaluate(corpus_root)
        return outcome
    finally:
        # Best-effort cleanup; errors while deleting are ignored.
        shutil.rmtree(scratch_dir, ignore_errors=True)
271 | 297 |
|
272 | 298 |
|
273 | | -def main() -> None: |
274 | | - results = run_benchmark() |
def run_persistence_benchmark() -> dict:
    """Run capture and evaluate as two separate child processes.

    This script is re-invoked twice with the current interpreter: first
    with ``--phase capture``, then - once that child has exited - with
    ``--phase evaluate`` against the same on-disk global root. The
    evaluate child writes its results to a JSON file, which is parsed and
    returned. The temp directory holding the root and the results file is
    removed afterwards.

    Raises:
        subprocess.CalledProcessError: if either child exits non-zero.
    """
    scratch_dir = Path(tempfile.mkdtemp(prefix="attune_memory_recall_eval_persist_"))
    corpus_root = scratch_dir / "global"
    corpus_root.mkdir(parents=True, exist_ok=True)

    # Shared argv pieces for both child invocations.
    relaunch = [sys.executable, str(Path(__file__).resolve())]
    root_args = ["--root", str(corpus_root)]
    try:
        subprocess.run([*relaunch, "--phase", "capture", *root_args], check=True)

        # Results travel through a file rather than stdout: attune_rag's
        # structlog lines print to stdout and would corrupt inline JSON.
        results_file = scratch_dir / "results.json"
        evaluate_cmd = [
            *relaunch,
            "--phase",
            "evaluate",
            *root_args,
            "--json-out",
            str(results_file),
        ]
        subprocess.run(evaluate_cmd, check=True)

        raw = results_file.read_text(encoding="utf-8")
        return json.loads(raw)
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)
| 336 | + |
| 337 | + |
| 338 | +def print_report(results: dict) -> None: |
275 | 339 | print(f"Corpus size: {results['corpus_size']}") |
276 | 340 | print(f"Positive queries: {results['positive_queries']}") |
277 | 341 | print(f"Negative queries: {results['negative_queries']}") |
@@ -302,5 +366,51 @@ def main() -> None: |
302 | 366 | print(f" top_result={f['top_result']!r} score={f['score']:.3f}") |
303 | 367 |
|
304 | 368 |
|
def main() -> None:
    """Parse the command line and run the selected benchmark phase.

    ``capture`` and ``evaluate`` operate on an explicit ``--root`` (argparse
    exits with a usage error when it is missing); ``persistence`` and the
    default ``all`` build their own temporary root and print the human
    report.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--phase",
        choices=("all", "persistence", "capture", "evaluate"),
        default="all",
        help="all = single-process benchmark (default); persistence = "
        "capture and evaluate in separate subprocesses; capture/evaluate "
        "= one half, against an explicit --root",
    )
    cli.add_argument(
        "--root",
        type=Path,
        help="global-root directory (required for capture/evaluate phases)",
    )
    cli.add_argument(
        "--json-out",
        type=Path,
        help="write raw JSON results to this file instead of printing the "
        "human report (evaluate phase)",
    )
    opts = cli.parse_args()
    phase = opts.phase

    # The single-half phases cannot invent a root of their own.
    if opts.root is None and phase in ("capture", "evaluate"):
        cli.error(f"--phase {phase} requires --root")

    if phase == "capture":
        capture_corpus(opts.root)
    elif phase == "evaluate":
        outcome = evaluate(opts.root)
        if opts.json_out:
            # Machine-readable output replaces the human report.
            opts.json_out.write_text(json.dumps(outcome), encoding="utf-8")
        else:
            print_report(outcome)
    else:
        if phase == "persistence":
            outcome = run_persistence_benchmark()
            print("Mode: PERSISTENCE - capture and query ran in separate OS processes;")
            print("the query side used a brand-new PersonalMemory instance.\n")
        else:
            outcome = run_benchmark()
        print_report(outcome)
| 414 | + |
305 | 415 | if __name__ == "__main__": |
306 | 416 | main() |
0 commit comments