Skip to content

Commit 225a756

Browse files
feat(scripts): cross-process persistence mode for recall benchmark (Run 3) (#1209)
Adds --phase {all,persistence,capture,evaluate} to scripts/memory_recall_eval.py. Persistence mode captures the corpus in one subprocess, lets it exit, then evaluates from a brand-new PersonalMemory instance in a second subprocess against the same on-disk global root. Result: identical to Run 2 in every dimension (hit@1 18/18, hit@3 18/18, same score distributions) - recall is fully file-backed and survives process death. Logged as Run 3 in docs/specs/memory-recall-eval/decisions.md. Results pass between subprocesses via a JSON file rather than stdout because attune_rag's structlog output prints to stdout.
1 parent 6ef1076 commit 225a756

2 files changed

Lines changed: 210 additions & 67 deletions

File tree

docs/specs/memory-recall-eval/decisions.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,36 @@ Precision has a soft edge case inherent to keyword-only retrieval, not
9797
worth chasing given `attune.memory.PersonalMemory` is still lightly
9898
used; revisit if/when real usage surfaces an actual bad-match incident,
9999
or if `attune_rag` grows a semantic retriever option.
100+
101+
## 2026-07-01 — Run 3: cross-process persistence confirmed
102+
103+
**Question:** Runs 1–2 captured and queried within the *same*
104+
`PersonalMemory` instance and process. Does recall survive process
105+
death — i.e., is the store genuinely file-backed with no hidden
106+
in-process state?
107+
108+
**Method:** added `--phase persistence` to
109+
[scripts/memory_recall_eval.py](../../../scripts/memory_recall_eval.py):
110+
the corpus is captured by one subprocess, which then **exits** (taking
111+
its `PersonalMemory` instance with it); a second subprocess constructs
112+
a brand-new instance pointed at the same on-disk `global_root` and runs
113+
the identical query set. Results pass back via a JSON file (not stdout —
114+
`attune_rag`'s structlog lines print to stdout and corrupt inline
115+
JSON; noted here in case a future consumer tries to pipe it).
116+
117+
**Result: identical to Run 2 in every dimension.**
118+
119+
- hit@1 = 18/18 (100%), hit@3 = 18/18 (100%)
120+
- Positive top-1 scores: `[4.5, 7.0, 8.0, 9.0, 10.0, 10.0, 10.0, 11.5,
121+
12.0, 12.5, 13.0, 14.0, 14.0, 14.5, 16.5, 18.5, 18.5, 21.0]` — same
122+
- Negative top-1 scores: `[0.0, 2.5, 2.5, 3.0, 5.5]` — same, including
123+
the same soft-overlap case (`test-flake-quarantine-policy` at 5.5)
124+
125+
**Verdict: persistence holds.** Capture-side writes are durable and the
126+
query side reconstructs retrieval purely from disk — no warm-instance
127+
advantage, no cold-start penalty, no state lost at process exit. The
128+
"probably fine mechanically" assumption from the session handoff is now
129+
a measured fact. The single-process default (`--phase all`) reproduces
130+
the same numbers, so the two methodologies are interchangeable for
131+
future runs; use `--phase persistence` when the change under test
132+
touches serialization or file layout.

scripts/memory_recall_eval.py

Lines changed: 177 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,28 @@
66
against PersonalMemory.query() and reports hit@1 / hit@3 / false-positive
77
rate. See docs/specs/memory-recall-eval/requirements.md for the design.
88
9+
Modes (--phase):
10+
all capture + evaluate in one process (default; Runs 1-2)
11+
persistence capture and evaluate in two SEPARATE OS processes against
12+
the same global root, so the query side starts from a
13+
brand-new PersonalMemory instance with nothing in memory
14+
- proves recall survives process death (Run 3)
15+
capture write the corpus into --root (used by persistence mode)
16+
evaluate query against an already-captured --root (used by
17+
persistence mode; --json-out FILE for machine-readable
18+
output)
19+
920
Never touches real memory (~/.attune/personal_memory or a project's
10-
.attune/memory/) - both roots are isolated temp directories.
21+
.attune/memory/) - the orchestrating modes use isolated temp
22+
directories; capture/evaluate take an explicit --root.
1123
"""
1224

1325
from __future__ import annotations
1426

27+
import argparse
28+
import json
1529
import shutil
30+
import subprocess
1631
import sys
1732
import tempfile
1833
from dataclasses import dataclass
@@ -196,82 +211,131 @@ class Query:
196211
]
197212

198213

199-
def run_benchmark() -> dict:
200-
tmp_root = Path(tempfile.mkdtemp(prefix="attune_memory_recall_eval_"))
201-
global_root = tmp_root / "global"
202-
unused_project_root = tmp_root / "no_project_dir" # deliberately never created
203-
global_root.mkdir(parents=True, exist_ok=True)
214+
def capture_corpus(global_root: Path) -> None:
    """Populate ``global_root`` with every entry of the benchmark corpus.

    A ``PersonalMemory`` is constructed on ``global_root`` and each item of
    ``CORPUS`` is stored through its ``capture`` method (topic, content and
    kind are forwarded unchanged).
    """
    # Sibling of the global root that this script never creates; it only
    # fills the project_root argument.
    project_placeholder = global_root.parent / "no_project_dir"
    memory = PersonalMemory(
        global_root=global_root,
        project_root=project_placeholder,
    )
    for item in CORPUS:
        memory.capture(item.topic, item.content, kind=item.kind)
204220

205-
try:
206-
pm = PersonalMemory(global_root=global_root, project_root=unused_project_root)
207-
208-
for entry in CORPUS:
209-
pm.capture(entry.topic, entry.content, kind=entry.kind)
210-
211-
hit_at_1 = 0
212-
hit_at_3 = 0
213-
positive_queries = [q for q in QUERIES if q.expected_topic is not None]
214-
negative_queries = [q for q in QUERIES if q.expected_topic is None]
215-
failures: list[dict] = []
216-
217-
positive_top_scores: list[float] = []
218-
for q in positive_queries:
219-
results = pm.query(q.text, k=3)
220-
topics_returned = [Path(r["path"]).parent.name for r in results]
221-
if results:
222-
positive_top_scores.append(results[0]["score"])
223-
if topics_returned[:1] == [q.expected_topic]:
224-
hit_at_1 += 1
225-
if q.expected_topic in topics_returned:
226-
hit_at_3 += 1
227-
else:
228-
failures.append(
229-
{
230-
"query": q.text,
231-
"expected": q.expected_topic,
232-
"got": topics_returned,
233-
}
234-
)
235-
236-
# NOTE: `score` is an unbounded raw keyword-overlap count (not a
237-
# normalized [0,1] confidence), so there is no universal absolute
238-
# threshold for "confident false positive." We report the actual
239-
# top-1 score distributions for positive vs. negative queries and
240-
# let the reader judge separation, rather than picking an arbitrary
241-
# cutoff that could over- or under-state precision.
242-
negative_top_scores: list[float] = []
243-
negative_hits: list[dict] = []
244-
for q in negative_queries:
245-
results = pm.query(q.text, k=3)
246-
top_score = results[0]["score"] if results else 0.0
247-
negative_top_scores.append(top_score)
248-
negative_hits.append(
221+
222+
def evaluate(global_root: Path) -> dict:
    """Run the ground-truth queries against an already-captured root.

    A fresh ``PersonalMemory`` is built on ``global_root`` and every entry
    of ``QUERIES`` is issued with ``k=3``. Queries with an
    ``expected_topic`` are "positive"; those with ``expected_topic is None``
    are "negative" (nothing in the corpus should match them).

    Returns:
        A dict with the keys ``corpus_size``, ``positive_queries``,
        ``negative_queries``, ``hit_at_1``, ``hit_at_1_rate``, ``hit_at_3``,
        ``hit_at_3_rate``, ``positive_top_scores``, ``negative_top_scores``,
        ``failures`` and ``negative_hits``. The two rates are fractions of
        the positive-query count and are reported as ``0.0`` when there are
        no positive queries (instead of raising ``ZeroDivisionError``).
    """
    unused_project_root = global_root.parent / "no_project_dir"  # never created
    pm = PersonalMemory(global_root=global_root, project_root=unused_project_root)

    positive_queries = [q for q in QUERIES if q.expected_topic is not None]
    negative_queries = [q for q in QUERIES if q.expected_topic is None]

    hit_at_1 = 0
    hit_at_3 = 0
    failures: list[dict] = []
    positive_top_scores: list[float] = []
    for q in positive_queries:
        results = pm.query(q.text, k=3)
        # A result's topic is the name of the directory holding its file.
        topics_returned = [Path(r["path"]).parent.name for r in results]
        if results:
            positive_top_scores.append(results[0]["score"])
        if topics_returned[:1] == [q.expected_topic]:
            hit_at_1 += 1
        if q.expected_topic in topics_returned:
            hit_at_3 += 1
        else:
            # Only a miss within the top 3 counts as a failure; a hit at
            # rank 2 or 3 lowers hit@1 but is not listed here.
            failures.append(
                {
                    "query": q.text,
                    "expected": q.expected_topic,
                    "got": topics_returned,
                }
            )

    # NOTE: `score` is an unbounded raw keyword-overlap count (not a
    # normalized [0,1] confidence), so there is no universal absolute
    # threshold for "confident false positive." We report the actual
    # top-1 score distributions for positive vs. negative queries and
    # let the reader judge separation, rather than picking an arbitrary
    # cutoff that could over- or under-state precision.
    negative_top_scores: list[float] = []
    negative_hits: list[dict] = []
    for q in negative_queries:
        results = pm.query(q.text, k=3)
        top_score = results[0]["score"] if results else 0.0
        negative_top_scores.append(top_score)
        negative_hits.append(
            {
                "query": q.text,
                "top_result": results[0]["path"] if results else None,
                "score": top_score,
            }
        )

    # Guard the rate computation so an all-negative query set yields 0.0
    # rather than crashing on a division by zero.
    positive_total = len(positive_queries)
    hit_at_1_rate = hit_at_1 / positive_total if positive_total else 0.0
    hit_at_3_rate = hit_at_3 / positive_total if positive_total else 0.0

    return {
        "corpus_size": len(CORPUS),
        "positive_queries": positive_total,
        "negative_queries": len(negative_queries),
        "hit_at_1": hit_at_1,
        "hit_at_1_rate": hit_at_1_rate,
        "hit_at_3": hit_at_3,
        "hit_at_3_rate": hit_at_3_rate,
        "positive_top_scores": positive_top_scores,
        "negative_top_scores": negative_top_scores,
        "failures": failures,
        "negative_hits": negative_hits,
    }
285+
286+
287+
def run_benchmark() -> dict:
    """Capture and evaluate inside one process (the Runs 1-2 methodology).

    Works entirely within a throwaway temp directory, which is removed
    before returning whether or not the benchmark succeeded.
    """
    scratch_dir = Path(tempfile.mkdtemp(prefix="attune_memory_recall_eval_"))
    store_dir = scratch_dir / "global"
    store_dir.mkdir(parents=True, exist_ok=True)
    try:
        capture_corpus(store_dir)
        report = evaluate(store_dir)
    finally:
        # Best-effort cleanup; a failed delete must not mask the real outcome.
        shutil.rmtree(scratch_dir, ignore_errors=True)
    return report
271297

272298

273-
def main() -> None:
274-
results = run_benchmark()
299+
def run_persistence_benchmark() -> dict:
    """Capture and evaluate in two SEPARATE OS processes (Run 3 methodology).

    This script is re-invoked twice with the current interpreter: first
    with ``--phase capture``, then - only after that process has exited -
    with ``--phase evaluate`` against the same on-disk global root. The
    evaluating process therefore starts from a brand-new PersonalMemory
    instance. Matching the numbers from run_benchmark() shows that recall
    is fully file-backed and survives process death.

    Raises ``subprocess.CalledProcessError`` if either child exits non-zero
    (both are run with ``check=True``). The temp directory is removed in
    all cases.
    """
    scratch_dir = Path(tempfile.mkdtemp(prefix="attune_memory_recall_eval_persist_"))
    store_dir = scratch_dir / "global"
    store_dir.mkdir(parents=True, exist_ok=True)
    # Common prefix for both child invocations: this interpreter, this file.
    base_cmd = [sys.executable, str(Path(__file__).resolve())]
    try:
        capture_cmd = [*base_cmd, "--phase", "capture", "--root", str(store_dir)]
        subprocess.run(capture_cmd, check=True)

        # The evaluate child hands its results back through a file rather
        # than stdout: attune_rag's structlog lines print to stdout and
        # would corrupt inline JSON.
        report_file = scratch_dir / "results.json"
        evaluate_cmd = [
            *base_cmd,
            "--phase",
            "evaluate",
            "--root",
            str(store_dir),
            "--json-out",
            str(report_file),
        ]
        subprocess.run(evaluate_cmd, check=True)

        payload = report_file.read_text(encoding="utf-8")
        return json.loads(payload)
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)
336+
337+
338+
def print_report(results: dict) -> None:
275339
print(f"Corpus size: {results['corpus_size']}")
276340
print(f"Positive queries: {results['positive_queries']}")
277341
print(f"Negative queries: {results['negative_queries']}")
@@ -302,5 +366,51 @@ def main() -> None:
302366
print(f" top_result={f['top_result']!r} score={f['score']:.3f}")
303367

304368

369+
def main() -> None:
    """Parse the command line and run the selected benchmark phase.

    Phases:
        all          capture + evaluate in this process, then print the report.
        persistence  capture and evaluate in separate subprocesses, then
                     print a mode banner followed by the report.
        capture      write the corpus into ``--root`` and exit.
        evaluate     query an already-captured ``--root``; write raw JSON to
                     ``--json-out`` if given, otherwise print the report.

    Invalid flag combinations are rejected through ``parser.error`` (usage
    message on stderr, exit status 2):
        * ``capture`` / ``evaluate`` without ``--root``;
        * ``--json-out`` with any phase other than ``evaluate`` (it would
          otherwise be silently ignored);
        * ``evaluate`` with a ``--root`` that is not an existing directory
          (nothing can have been captured there).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--phase",
        choices=("all", "persistence", "capture", "evaluate"),
        default="all",
        help="all = single-process benchmark (default); persistence = "
        "capture and evaluate in separate subprocesses; capture/evaluate "
        "= one half, against an explicit --root",
    )
    parser.add_argument(
        "--root",
        type=Path,
        help="global-root directory (required for capture/evaluate phases)",
    )
    parser.add_argument(
        "--json-out",
        type=Path,
        help="write raw JSON results to this file instead of printing the "
        "human report (evaluate phase)",
    )
    args = parser.parse_args()

    if args.phase in ("capture", "evaluate") and args.root is None:
        parser.error(f"--phase {args.phase} requires --root")
    if args.json_out is not None and args.phase != "evaluate":
        # Previously accepted and then ignored; fail loudly instead so the
        # caller does not wait for a file that is never written.
        parser.error("--json-out is only supported with --phase evaluate")
    if args.phase == "evaluate" and not args.root.is_dir():
        parser.error(
            f"--root {args.root} is not an existing directory; "
            "run --phase capture against it first"
        )

    if args.phase == "capture":
        # Mirror the orchestrating modes, which create the global root
        # before capturing into it.
        args.root.mkdir(parents=True, exist_ok=True)
        capture_corpus(args.root)
        return
    if args.phase == "evaluate":
        results = evaluate(args.root)
        if args.json_out is not None:
            args.json_out.write_text(json.dumps(results), encoding="utf-8")
        else:
            print_report(results)
        return

    if args.phase == "persistence":
        results = run_persistence_benchmark()
        print("Mode: PERSISTENCE - capture and query ran in separate OS processes;")
        print("the query side used a brand-new PersonalMemory instance.\n")
    else:
        results = run_benchmark()
    print_report(results)
413+
414+
305415
if __name__ == "__main__":
306416
main()

0 commit comments

Comments
 (0)