Skip to content

Commit cfa96e8

Browse files
cdeustclaude
andcommitted
feat(verification): paper + reproducibility infrastructure for thermodynamic memory claims
Adds the paper "Thermodynamic memory vs flat-importance" (4,461 words + 4 appendices) with the full verification stack the paper's Limitations section depends on: - Pre-registered Fisher-style protocol (E1-E6) at tasks/verification-protocol.md - Curie measurement-discipline audit + 30-item HARD-STOP checklist - Six benchmark harnesses: ablation, N-scan, decay-sweep, longitudinal, cross-benchmark, and master verification report aggregator - Deterministic-replay primitives: db_snapshot.py (pg_dump+manifest, version drift enforcement, pgvector binary SHA-256 fingerprint), db_setup.py (autocommit-aware GUC pinning, full PG unit normaliser), noise_floor.py (per-rerun σ measurement) - HNSW determinism playbook + reloption pinning for all 4 cluster HNSW indexes (m=16, ef_construction=64) so future builds match the manifest - Telemetry: thread-safe counter store + JSONL audit log + 9 handlers instrumented for read/write ratio measurement (closes Popper C6) - CORTEX_DECAY_LAMBDA env var on thermodynamics.compute_decay() for the Phase E3 dose-response sweep Smoke runs land under benchmarks/results/ (decay sweep showed λ=0.95 MRR 0.671 vs λ=1.0 MRR 0.399 — 27pp gap; longitudinal stable across 360-day buckets, delta +0.010). Layer audit: zero mcp_server imports in benchmarks/lib/ determinism stack. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 18b4be4 commit cfa96e8

49 files changed

Lines changed: 5501 additions & 20 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,4 @@ traces/
3030
# Claude Code ephemeral session state
3131
.claude/scheduled_tasks.lock
3232
.claude/session-cache.json
33+
benchmarks/snapshots/

benchmarks/lib/_xb_drivers.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""Subprocess drivers for cross_benchmark_runner.
2+
3+
Each driver is invoked as `python -m benchmarks.lib._xb_drivers <bench> <data_path> <limit>`
4+
in a fresh subprocess. Env vars (CORTEX_DECAY_LAMBDA, CORTEX_MEMORY_*) are set
5+
by the parent before exec — the driver does not parse them. The driver runs
6+
the inner benchmark loop and emits a single line `__JSON__{...}` to stdout.
7+
8+
Subprocess isolation is required because mcp_server.core.thermodynamics
9+
reads CORTEX_DECAY_LAMBDA at module import time and the lru_cache(maxsize=1)
10+
on get_memory_settings pins the first observed value.
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import json
16+
import sys
17+
import time
18+
from collections import defaultdict
19+
from pathlib import Path
20+
21+
# Prepend the directory three path components above this file (parents[2] of
# the resolved path) to sys.path so the function-local `benchmarks.*` imports
# in the drivers resolve when this file is run directly.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
22+
23+
24+
def _drive_longmemeval(data_path: str, limit: int) -> dict:
25+
from benchmarks.longmemeval.run_benchmark import run_benchmark
26+
r = run_benchmark(data_path, limit=limit)
27+
return {
28+
"mrr": r["overall_mrr"],
29+
"recall_at_10": r["overall_recall10"],
30+
"category_mrr": r.get("category_mrr", {}),
31+
"category_recall10": r.get("category_recall10", {}),
32+
"elapsed_s": r.get("elapsed_s", 0.0),
33+
}
34+
35+
36+
def _drive_locomo(data_path: str, limit: int) -> dict:
37+
from benchmarks.lib.bench_db import BenchmarkDB
38+
from benchmarks.locomo.data import extract_sessions, load_locomo
39+
from benchmarks.locomo.run_benchmark import evaluate_conversation
40+
41+
data = load_locomo(data_path)
42+
if limit:
43+
data = data[:limit]
44+
45+
agg: dict[str, list[dict]] = defaultdict(list)
46+
t0 = time.time()
47+
with BenchmarkDB() as db:
48+
for conv in data:
49+
sessions = extract_sessions(conv["conversation"])
50+
db.clear()
51+
memories = [
52+
{
53+
"content": s["content"],
54+
"user_content": s.get("user_content", ""),
55+
"created_at": s.get("date", ""),
56+
"source": f"session_{s['session_idx']}",
57+
"tags": ["locomo"],
58+
}
59+
for s in sessions
60+
]
61+
mids, smap = db.load_memories(memories, domain="locomo")
62+
cr = evaluate_conversation(db, sessions, mids, smap, conv["qa"])
63+
for c, rs in cr.items():
64+
agg[c].extend(rs)
65+
elapsed = time.time() - t0
66+
all_rs = [r for rs in agg.values() for r in rs]
67+
n = len(all_rs)
68+
mrr = sum(1.0 / r["hit_rank"] for r in all_rs if r["hit_rank"]) / n if n else 0.0
69+
r10 = (
70+
sum(1 for r in all_rs if r["hit_rank"] and r["hit_rank"] <= 10) / n if n else 0.0
71+
)
72+
cat_mrr: dict[str, float] = {}
73+
cat_r10: dict[str, float] = {}
74+
for cat, rs in agg.items():
75+
if not rs:
76+
continue
77+
m = len(rs)
78+
cat_mrr[cat] = sum(1.0 / r["hit_rank"] for r in rs if r["hit_rank"]) / m
79+
cat_r10[cat] = sum(
80+
1 for r in rs if r["hit_rank"] and r["hit_rank"] <= 10
81+
) / m
82+
return {
83+
"mrr": mrr,
84+
"recall_at_10": r10,
85+
"n_questions": n,
86+
"category_mrr": cat_mrr,
87+
"category_recall10": cat_r10,
88+
"elapsed_s": elapsed,
89+
}
90+
91+
92+
def main() -> int:
93+
if len(sys.argv) != 4:
94+
print("usage: _xb_drivers.py <longmemeval|locomo> <data_path> <limit>",
95+
file=sys.stderr)
96+
return 2
97+
bench, data_path, limit_s = sys.argv[1], sys.argv[2], sys.argv[3]
98+
limit = int(limit_s)
99+
if bench == "longmemeval":
100+
out = _drive_longmemeval(data_path, limit)
101+
elif bench == "locomo":
102+
out = _drive_locomo(data_path, limit)
103+
else:
104+
print(f"unknown benchmark: {bench}", file=sys.stderr)
105+
return 2
106+
print("__JSON__" + json.dumps(out))
107+
return 0
108+
109+
110+
if __name__ == "__main__":
111+
sys.exit(main())

0 commit comments

Comments
 (0)