Skip to content

Commit ef178da

Browse files
cdeustclaude
andcommitted
feat(verif): E1 v3 LoCoMo driver — 14-row two-baseline sweep
Mirror of benchmarks/lib/run_e1_v3_lme.py for LoCoMo: - BASELINE_NO_CONSOLIDATION (anchor for longitudinal read-path mechs) - 3 longitudinal read-path rows: RECONSOLIDATION, CO_ACTIVATION, ADAPTIVE_DECAY - BASELINE_WITH_CONSOLIDATION (anchor for consolidation-only mechs) - 9 consolidation rows: CASCADE, INTERFERENCE, HOMEOSTATIC_PLASTICITY, SYNAPTIC_PLASTICITY, MICROGLIAL_PRUNING, TWO_STAGE_MODEL, EMOTIONAL_DECAY, TRIPARTITE_SYNAPSE, SCHEMA_ENGINE Serial subprocess loop, mid-run-crash-friendly summary.csv update per row. Designed to launch against 6c51bce (consolidation cadence migrated to ingested_at; LoCoMo MRR_with_consolidation no longer collapses to 0.222). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6c51bce commit ef178da

1 file changed

Lines changed: 286 additions & 0 deletions

File tree

benchmarks/lib/run_e1_v3_locomo.py

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
"""E1 v3 — LoCoMo per-mechanism ablation runner (two-baseline design).
2+
3+
Drives `benchmarks/locomo/run_benchmark.py` once per row, serially, against
4+
the same PG instance. Each row writes its result JSON via `--results-out`.
5+
As the sweep progresses, the aggregate `summary.csv` and `manifest.json` are
6+
rewritten after rows finish, so a mid-run crash keeps the earlier results.
7+
8+
Output: benchmarks/results/ablation/locomo_v3/
9+
10+
Why serial: the harness mutates a shared PG database (db.clear() per
11+
conversation). Parallel rows would contaminate each other's haystacks.
12+
13+
Two-baseline design (per tasks/e1-v3-locomo-smoke-finding.md, Option B):
14+
15+
LoCoMo session timestamps are real 2023 conversation dates. At 2026 wall
16+
time, every loaded memory is ≈3 years old. Cortex's compression gates
17+
(COMPRESSION_GIST_AGE_HOURS=168, COMPRESSION_TAG_AGE_HOURS=720) fire on
18+
absolute timestamp diff, so consolidation collapses the corpus to gists/
19+
tags on first pass. Smoke: MRR 0.866 (no consolidation) → 0.222 (with).
20+
21+
To preserve honest per-mechanism evidence:
22+
23+
- Longitudinal read-path mechanisms (RECONSOLIDATION, CO_ACTIVATION,
24+
ADAPTIVE_DECAY) are ablated against BASELINE_NO_CONSOLIDATION. These do
25+
not require a consolidation pass — their effect is heat / access / co-
26+
access tracking that accumulates via cross-question reads.
27+
28+
- Consolidation-only mechanisms (CASCADE, INTERFERENCE,
29+
HOMEOSTATIC_PLASTICITY, SYNAPTIC_PLASTICITY, MICROGLIAL_PRUNING,
30+
TWO_STAGE_MODEL, EMOTIONAL_DECAY, TRIPARTITE_SYNAPSE, SCHEMA_ENGINE) are
31+
ablated against BASELINE_WITH_CONSOLIDATION. Each row's delta is the
32+
mechanism's role within the observed (timestamp-collision) regime;
33+
this is documented as a benchmark-property disclosure in the writeup.
34+
35+
14 rows total. Estimated wall ~7h.
36+
"""
37+
38+
from __future__ import annotations
39+
40+
import csv
41+
import json
42+
import subprocess
43+
import sys
44+
import time
45+
from dataclasses import dataclass
46+
from datetime import datetime, timezone
47+
from pathlib import Path
48+
49+
# Repository root: parents[0] is this file's directory, parents[1] its parent,
# parents[2] the directory above that.
_ROOT = Path(__file__).resolve().parents[2]
# Directory that receives every per-row result JSON plus summary.csv and
# manifest.json.
_OUT_DIR = _ROOT.joinpath("benchmarks", "results", "ablation", "locomo_v3")
# Benchmark harness script launched once per row.
_HARNESS = _ROOT.joinpath("benchmarks", "locomo", "run_benchmark.py")
52+
53+
54+
@dataclass
class Row:
    """Specification of one sweep row: a single harness run and its baseline."""

    # Row name; also used as the stem of the row's result JSON file.
    label: str
    # Mechanism name forwarded to the harness via --ablate; None means no
    # ablation flag is passed (baseline rows).
    ablate: str | None
    # When True the harness is launched with --with-consolidation.
    with_consolidation: bool
    # Label of the BASELINE row this row's delta is computed against.
    anchor: str
60+
61+
62+
# Two-baseline 14-row sweep specification.
# Order matters: each baseline precedes the rows anchored to it, and main()
# runs ROWS[0] first as its sanity gate.
ROWS: list[Row] = [
    # --- Baseline 1: NO consolidation (anchor for longitudinal read-path mechs)
    Row(
        label="BASELINE_NO_CONSOLIDATION",
        ablate=None,
        with_consolidation=False,
        anchor="BASELINE_NO_CONSOLIDATION",
    ),
    # Longitudinal read-path mechanisms, each ablated against the
    # no-consolidation baseline.
    *(
        Row(
            label=mech,
            ablate=mech,
            with_consolidation=False,
            anchor="BASELINE_NO_CONSOLIDATION",
        )
        for mech in (
            "RECONSOLIDATION",
            "CO_ACTIVATION",
            "ADAPTIVE_DECAY",
        )
    ),
    # --- Baseline 2: WITH consolidation (anchor for consolidation-only mechs)
    Row(
        label="BASELINE_WITH_CONSOLIDATION",
        ablate=None,
        with_consolidation=True,
        anchor="BASELINE_WITH_CONSOLIDATION",
    ),
    # Consolidation-only mechanisms, each ablated against the
    # with-consolidation baseline.
    *(
        Row(
            label=mech,
            ablate=mech,
            with_consolidation=True,
            anchor="BASELINE_WITH_CONSOLIDATION",
        )
        for mech in (
            "CASCADE",
            "INTERFERENCE",
            "HOMEOSTATIC_PLASTICITY",
            "SYNAPTIC_PLASTICITY",
            "MICROGLIAL_PRUNING",
            "TWO_STAGE_MODEL",
            "EMOTIONAL_DECAY",
            "TRIPARTITE_SYNAPSE",
            "SCHEMA_ENGINE",
        )
    ),
]
100+
101+
102+
def _git_sha() -> str:
    """Return the stripped stdout of ``git rev-parse HEAD`` run in ``_ROOT``.

    The command's exit status is not inspected, so the result is an empty
    string whenever git prints nothing on stdout.
    """
    completed = subprocess.run(
        ["git", "rev-parse", "HEAD"],
        cwd=_ROOT,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    sha = completed.stdout.strip()
    return sha
107+
108+
109+
def _git_dirty() -> bool:
    """Report whether the working tree differs from HEAD.

    Runs ``git diff --stat --ignore-submodules=all HEAD`` in ``_ROOT``.
    NOTE(review): ``git diff`` does not list untracked files, so new
    uncommitted files are not detected here.

    Returns:
        True when the diff output is non-empty, or when git exits non-zero
        (the tree state cannot be verified, so it is treated as dirty instead
        of being silently reported clean); False otherwise.
    """
    out = subprocess.run(
        ["git", "diff", "--stat", "--ignore-submodules=all", "HEAD"],
        capture_output=True,
        text=True,
        cwd=_ROOT,
    )
    if out.returncode != 0:
        # Fail closed: a failed git call yields empty stdout, which would
        # otherwise look exactly like a clean tree and bypass the guard.
        print(
            f"[E1v3-LoCoMo][WARN] git diff exited {out.returncode}; "
            f"treating tree as dirty. stderr: {out.stderr.strip()}",
            file=sys.stderr,
            flush=True,
        )
        return True
    return bool(out.stdout.strip())
117+
118+
119+
def _run_row(row: Row) -> dict:
    """Run the harness once for *row* and collect its headline metrics.

    The harness is launched as ``uv run python <_HARNESS> --results-out
    <row.label>.json`` from ``_ROOT``, with ``--with-consolidation`` and
    ``--ablate <name>`` appended according to the row. Its stdout/stderr are
    inherited, not captured.

    Args:
        row: The sweep row to execute.

    Returns:
        A dict with keys ``row``, ``anchor``, ``with_consolidation``,
        ``ablate``, ``mrr``, ``r10``, ``category_mrr``, ``category_recall10``,
        ``wall_seconds`` and ``returncode``. ``mrr`` / ``r10`` are None and
        the category dicts are empty when the result file is missing or
        cannot be parsed.
    """
    out_path = _OUT_DIR / f"{row.label}.json"
    # Drop any result left by an earlier sweep. Without this, a harness crash
    # before it writes its JSON would make us parse the stale file and report
    # old metrics as if they belonged to this run.
    out_path.unlink(missing_ok=True)

    cmd = [
        "uv", "run", "python", str(_HARNESS),
        "--results-out", str(out_path),
    ]
    if row.with_consolidation:
        cmd.append("--with-consolidation")
    if row.ablate is not None:
        cmd += ["--ablate", row.ablate]

    print(f"\n{'=' * 70}", flush=True)
    print(f"[E1v3-LoCoMo] {row.label} — start {datetime.now(timezone.utc).isoformat()}",
          flush=True)
    print(f"[E1v3-LoCoMo] cmd: {' '.join(cmd)}", flush=True)
    print(f"{'=' * 70}", flush=True)

    t0 = time.time()
    proc = subprocess.run(cmd, cwd=_ROOT)
    wall = time.time() - t0

    rc = proc.returncode
    if rc != 0:
        # A failed row is reported but does not raise; the caller decides
        # whether to continue the sweep.
        print(f"[E1v3-LoCoMo][ERROR] {row.label} returncode={rc}",
              file=sys.stderr, flush=True)

    mrr: float | None = None
    r10: float | None = None
    cat_mrr: dict[str, float] = {}
    cat_r10: dict[str, float] = {}
    if out_path.exists():
        try:
            data = json.loads(out_path.read_text(encoding="utf-8"))
            mrr = data.get("overall_mrr")
            r10 = data.get("overall_recall10")
            # `or {}` also covers an explicit null in the JSON.
            cat_mrr = data.get("category_mrr", {}) or {}
            cat_r10 = data.get("category_recall10", {}) or {}
        except Exception as e:
            # Deliberately best-effort: an unreadable result must not abort
            # a multi-hour sweep; the row is recorded with empty metrics.
            print(f"[E1v3-LoCoMo][WARN] {row.label} parse {out_path}: {e}",
                  file=sys.stderr, flush=True)

    print(f"[E1v3-LoCoMo] {row.label} done — rc={rc} wall={wall:.1f}s "
          f"mrr={mrr} r10={r10}", flush=True)
    return {
        "row": row.label,
        "anchor": row.anchor,
        "with_consolidation": row.with_consolidation,
        "ablate": row.ablate,
        "mrr": mrr,
        "r10": r10,
        "category_mrr": cat_mrr,
        "category_recall10": cat_r10,
        "wall_seconds": wall,
        "returncode": rc,
    }
174+
175+
176+
def _write_summary(rows: list[dict]) -> None:
    """Rewrite ``summary.csv`` in ``_OUT_DIR`` from the row results so far.

    Each record in *rows* is a dict as returned by ``_run_row``. Baseline
    rows (``row == anchor``) get empty delta cells. For every other row the
    delta is ``anchor_metric - row_metric``, so a positive delta means the
    ablation hurt the metric. A delta is left empty when either side is
    missing.
    """
    # Metrics of every baseline seen so far, keyed by baseline label.
    baselines: dict[str, tuple[float | None, float | None]] = {
        rec["row"]: (rec["mrr"], rec["r10"])
        for rec in rows
        if rec["row"] == rec["anchor"]
    }

    def _metric(value):
        # Missing metrics become empty cells.
        return "" if value is None else f"{value:.4f}"

    def _delta(anchor_value, value):
        if anchor_value is None or value is None:
            return ""
        return f"{anchor_value - value:.4f}"

    header = [
        "row", "anchor", "with_consolidation", "ablate",
        "mrr", "r10", "wall_seconds", "returncode",
        "delta_mrr_vs_anchor", "delta_r10_vs_anchor",
    ]

    summary_path = _OUT_DIR / "summary.csv"
    with summary_path.open("w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for rec in rows:
            if rec["row"] == rec["anchor"]:
                delta_mrr = ""
                delta_r10 = ""
            else:
                base_mrr, base_r10 = baselines.get(rec["anchor"], (None, None))
                delta_mrr = _delta(base_mrr, rec["mrr"])
                delta_r10 = _delta(base_r10, rec["r10"])
            writer.writerow([
                rec["row"],
                rec["anchor"],
                rec["with_consolidation"],
                rec["ablate"] or "",
                _metric(rec["mrr"]),
                _metric(rec["r10"]),
                f"{rec['wall_seconds']:.1f}",
                rec["returncode"],
                delta_mrr,
                delta_r10,
            ])
    print(f"[E1v3-LoCoMo] summary → {summary_path}", flush=True)
210+
211+
212+
def main() -> int:
    """Run the whole sweep serially and return a process exit code.

    Flow:
      1. Refuse to launch if the working tree is dirty. This check runs
         before anything is written, so a refused launch cannot overwrite
         the manifest of a previous sweep.
      2. Write an initial ``manifest.json`` (code hash, start time, row spec).
      3. Run ``ROWS[0]`` as a sanity gate; abort if it fails or yields no MRR.
      4. Warn (without aborting) if the baseline MRR is more than 0.05 away
         from 0.794.
      5. Run the remaining rows, rewriting ``manifest.json`` and
         ``summary.csv`` after every completed row.

    Returns:
        0 if every row exited with status 0; 1 if the first baseline failed
        or any row exited non-zero; 2 if the tree was dirty.
    """
    sha = _git_sha()
    dirty = _git_dirty()
    started_at = datetime.now(timezone.utc).isoformat()

    if dirty:
        print("[E1v3-LoCoMo][FATAL] tree is dirty; refusing to launch.",
              file=sys.stderr, flush=True)
        return 2

    _OUT_DIR.mkdir(parents=True, exist_ok=True)

    manifest = {
        "code_hash": sha,
        "dirty": dirty,
        "started_at": started_at,
        "n_rows": len(ROWS),
        "design": "two-baseline",
        "rows_spec": [
            {
                "label": r.label,
                "ablate": r.ablate,
                "with_consolidation": r.with_consolidation,
                "anchor": r.anchor,
            }
            for r in ROWS
        ],
        "rows": [],
    }
    manifest_path = _OUT_DIR / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2))

    rows: list[dict] = []

    def _checkpoint() -> None:
        # Persist everything known so far, so a crash loses at most the row
        # that is currently running.
        manifest["rows"] = rows
        manifest_path.write_text(json.dumps(manifest, indent=2, default=str))
        _write_summary(rows)

    # 1. BASELINE_NO_CONSOLIDATION first — sanity gate vs CLAUDE.md (~0.794 MRR).
    rows.append(_run_row(ROWS[0]))
    if rows[0]["returncode"] != 0 or rows[0]["mrr"] is None:
        print("[E1v3-LoCoMo][FATAL] BASELINE_NO_CONSOLIDATION failed; aborting.",
              file=sys.stderr, flush=True)
        manifest["aborted"] = "baseline_no_consolidation_failed"
        _checkpoint()
        return 1

    # Checkpoint the baseline right away rather than waiting for the second
    # row to finish.
    manifest["last_completed_at"] = datetime.now(timezone.utc).isoformat()
    _checkpoint()

    # CLAUDE.md sanity tolerance: ±0.05 around 0.794. Soft warning if outside.
    expected_mrr = 0.794
    tolerance = 0.05
    bn_mrr = rows[0]["mrr"]
    if abs(bn_mrr - expected_mrr) > tolerance:
        print(
            f"[E1v3-LoCoMo][WARN] BASELINE_NO_CONSOLIDATION MRR={bn_mrr:.3f} "
            f"deviates >0.05 from CLAUDE.md headline 0.794. Continuing — "
            f"document in writeup.",
            file=sys.stderr, flush=True,
        )

    # 2. Remaining rows, checkpointing after each one.
    for row in ROWS[1:]:
        rows.append(_run_row(row))
        manifest["last_completed_at"] = datetime.now(timezone.utc).isoformat()
        _checkpoint()

    manifest["finished_at"] = datetime.now(timezone.utc).isoformat()
    _checkpoint()

    # Individual row failures do not stop the sweep, but they do make the
    # overall exit status non-zero.
    nonzero = [r for r in rows if r["returncode"] != 0]
    print(f"\n[E1v3-LoCoMo] complete. nonzero rows: {len(nonzero)}", flush=True)
    return 0 if not nonzero else 1
283+
284+
285+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)