|
| 1 | +"""E1 v3 — LoCoMo per-mechanism ablation runner (two-baseline design). |
| 2 | +
|
| 3 | +Drives `benchmarks/locomo/run_benchmark.py` once per row, serially, against |
| 4 | +the same PG instance. Each row writes its result JSON via `--results-out`. |
| 5 | +After all rows complete, an aggregate `summary.csv` and `manifest.json` |
| 6 | +are written. |
| 7 | +
|
| 8 | +Output: benchmarks/results/ablation/locomo_v3/ |
| 9 | +
|
| 10 | +Why serial: the harness mutates a shared PG database (db.clear() per |
| 11 | +conversation). Parallel rows would contaminate each other's haystacks. |
| 12 | +
|
| 13 | +Two-baseline design (per tasks/e1-v3-locomo-smoke-finding.md, Option B): |
| 14 | +
|
| 15 | +LoCoMo session timestamps are real 2023 conversation dates. At 2026 wall |
| 16 | +time, every loaded memory is ≈3 years old. Cortex's compression gates |
| 17 | +(COMPRESSION_GIST_AGE_HOURS=168, COMPRESSION_TAG_AGE_HOURS=720) fire on |
| 18 | +absolute timestamp diff, so consolidation collapses the corpus to gists/ |
| 19 | +tags on first pass. Smoke: MRR 0.866 (no consolidation) → 0.222 (with). |
| 20 | +
|
| 21 | +To preserve honest per-mechanism evidence: |
| 22 | +
|
| 23 | +- Longitudinal read-path mechanisms (RECONSOLIDATION, CO_ACTIVATION, |
| 24 | + ADAPTIVE_DECAY) are ablated against BASELINE_NO_CONSOLIDATION. These do |
| 25 | + not require a consolidation pass — their effect is heat / access / co- |
| 26 | + access tracking that accumulates via cross-question reads. |
| 27 | +
|
| 28 | +- Consolidation-only mechanisms (CASCADE, INTERFERENCE, |
| 29 | + HOMEOSTATIC_PLASTICITY, SYNAPTIC_PLASTICITY, MICROGLIAL_PRUNING, |
| 30 | + TWO_STAGE_MODEL, EMOTIONAL_DECAY, TRIPARTITE_SYNAPSE, SCHEMA_ENGINE) are |
| 31 | + ablated against BASELINE_WITH_CONSOLIDATION. Each row's delta is the |
| 32 | + mechanism's role within the observed (timestamp-collision) regime; |
| 33 | + this is documented as a benchmark-property disclosure in the writeup. |
| 34 | +
|
| 35 | +14 rows total. Estimated wall ~7h. |
| 36 | +""" |
| 37 | + |
| 38 | +from __future__ import annotations |
| 39 | + |
| 40 | +import csv |
| 41 | +import json |
| 42 | +import subprocess |
| 43 | +import sys |
| 44 | +import time |
| 45 | +from dataclasses import dataclass |
| 46 | +from datetime import datetime, timezone |
| 47 | +from pathlib import Path |
| 48 | + |
# Two directory levels above the directory that contains this file. Used as
# the working directory for the git queries and for every harness invocation.
# NOTE(review): presumably the repository root — confirm against file location.
_ROOT = Path(__file__).resolve().parents[2]
# Directory that receives the per-row result JSONs, summary.csv and manifest.json.
_OUT_DIR = _ROOT / "benchmarks" / "results" / "ablation" / "locomo_v3"
# Benchmark script that is launched once per ablation row.
_HARNESS = _ROOT / "benchmarks" / "locomo" / "run_benchmark.py"
| 52 | + |
| 53 | + |
@dataclass
class Row:
    """One harness invocation in the ablation matrix."""

    label: str  # row name; also the stem of the row's result JSON file
    ablate: str | None  # value forwarded as --ablate; None means no --ablate flag
    with_consolidation: bool  # True adds --with-consolidation to the harness command
    anchor: str  # the BASELINE label this row's delta is computed against
| 60 | + |
| 61 | + |
# Two-baseline 14-row design, generated from the two mechanism groups below.
_ANCHOR_NO_CONSOLIDATION = "BASELINE_NO_CONSOLIDATION"
_ANCHOR_WITH_CONSOLIDATION = "BASELINE_WITH_CONSOLIDATION"

# Longitudinal read-path mechanisms — ablated vs NO_CONSOLIDATION.
_READ_PATH_MECHANISMS = (
    "RECONSOLIDATION",
    "CO_ACTIVATION",
    "ADAPTIVE_DECAY",
)

# Consolidation-only mechanisms — ablated vs WITH_CONSOLIDATION.
_CONSOLIDATION_MECHANISMS = (
    "CASCADE",
    "INTERFERENCE",
    "HOMEOSTATIC_PLASTICITY",
    "SYNAPTIC_PLASTICITY",
    "MICROGLIAL_PRUNING",
    "TWO_STAGE_MODEL",
    "EMOTIONAL_DECAY",
    "TRIPARTITE_SYNAPSE",
    "SCHEMA_ENGINE",
)

# Each baseline row is its own anchor and is listed before the rows that are
# measured against it; the no-consolidation baseline stays at index 0.
ROWS: list[Row] = [
    # === Baseline 1: NO consolidation, anchor for longitudinal read-path mechs ===
    Row(
        _ANCHOR_NO_CONSOLIDATION,
        ablate=None,
        with_consolidation=False,
        anchor=_ANCHOR_NO_CONSOLIDATION,
    ),
    *(
        Row(
            mechanism,
            ablate=mechanism,
            with_consolidation=False,
            anchor=_ANCHOR_NO_CONSOLIDATION,
        )
        for mechanism in _READ_PATH_MECHANISMS
    ),
    # === Baseline 2: WITH consolidation, anchor for consolidation-only mechs ===
    Row(
        _ANCHOR_WITH_CONSOLIDATION,
        ablate=None,
        with_consolidation=True,
        anchor=_ANCHOR_WITH_CONSOLIDATION,
    ),
    *(
        Row(
            mechanism,
            ablate=mechanism,
            with_consolidation=True,
            anchor=_ANCHOR_WITH_CONSOLIDATION,
        )
        for mechanism in _CONSOLIDATION_MECHANISMS
    ),
]
| 100 | + |
| 101 | + |
def _git_sha() -> str:
    """Return the stripped stdout of ``git rev-parse HEAD`` run in ``_ROOT``.

    git's exit status is not inspected, so the result is simply whatever git
    printed on stdout (an empty string if it printed nothing).
    """
    rev_parse = ["git", "rev-parse", "HEAD"]
    completed = subprocess.run(
        rev_parse,
        cwd=_ROOT,
        text=True,
        capture_output=True,
    )
    return completed.stdout.strip()
| 107 | + |
| 108 | + |
def _git_dirty() -> bool:
    """Report whether the working tree in ``_ROOT`` differs from ``HEAD``.

    Runs ``git diff --stat --ignore-submodules=all HEAD``; any output on
    stdout is taken to mean there are uncommitted changes.

    Fails closed: if git itself exits non-zero, the tree state is unknown, so
    a warning is printed to stderr and the tree is reported as dirty rather
    than silently treated as clean.

    NOTE(review): ``git diff`` does not list untracked files, so brand-new
    uncommitted files will not trip this check — confirm that is acceptable.

    Returns:
        True when changes were reported or git failed, False otherwise.
    """
    out = subprocess.run(
        ["git", "diff", "--stat", "--ignore-submodules=all", "HEAD"],
        capture_output=True,
        text=True,
        cwd=_ROOT,
    )
    if out.returncode != 0:
        # Empty stdout from a failed git call must not be read as "clean".
        print(
            f"[E1v3-LoCoMo][WARN] git diff failed (rc={out.returncode}): "
            f"{out.stderr.strip()} — treating tree as dirty.",
            file=sys.stderr, flush=True,
        )
        return True
    return bool(out.stdout.strip())
| 117 | + |
| 118 | + |
def _read_row_metrics(out_path: Path, label: str) -> dict:
    """Extract the metric fields of a row record from the harness result JSON.

    Args:
        out_path: result file the harness was asked to write.
        label: row label, used only in warning messages.

    Returns:
        A dict with keys ``mrr``, ``r10``, ``category_mrr`` and
        ``category_recall10``. The scalar metrics are None and the category
        maps empty when the file is missing, unreadable, not valid JSON, or
        not a JSON object; the last three cases also print a warning.
    """
    metrics: dict = {
        "mrr": None,
        "r10": None,
        "category_mrr": {},
        "category_recall10": {},
    }
    if not out_path.exists():
        return metrics
    try:
        data = json.loads(out_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[E1v3-LoCoMo][WARN] {label} parse {out_path}: {e}",
              file=sys.stderr, flush=True)
        return metrics
    if not isinstance(data, dict):
        print(f"[E1v3-LoCoMo][WARN] {label} parse {out_path}: "
              f"expected a JSON object, got {type(data).__name__}",
              file=sys.stderr, flush=True)
        return metrics
    metrics["mrr"] = data.get("overall_mrr")
    metrics["r10"] = data.get("overall_recall10")
    metrics["category_mrr"] = data.get("category_mrr", {}) or {}
    metrics["category_recall10"] = data.get("category_recall10", {}) or {}
    return metrics


def _run_row(row: Row) -> dict:
    """Run the harness for one ablation row and collect its metrics.

    The harness is launched through ``uv run python`` with ``_ROOT`` as the
    working directory and is asked to write ``<label>.json`` into ``_OUT_DIR``.
    A non-zero exit status is logged but does not raise.

    Args:
        row: the row specification (label, optional ablation, consolidation
            flag, anchor label).

    Returns:
        A record with keys ``row``, ``anchor``, ``with_consolidation``,
        ``ablate``, ``mrr``, ``r10``, ``category_mrr``, ``category_recall10``,
        ``wall_seconds`` and ``returncode``. ``mrr``/``r10`` are None when no
        usable result file was produced by this invocation.
    """
    out_path = _OUT_DIR / f"{row.label}.json"
    # Remove any result left by an earlier run so that, if the harness fails
    # without writing, stale metrics are not attributed to this invocation.
    try:
        out_path.unlink()
    except FileNotFoundError:
        pass

    cmd = [
        "uv", "run", "python", str(_HARNESS),
        "--results-out", str(out_path),
    ]
    if row.with_consolidation:
        cmd.append("--with-consolidation")
    if row.ablate is not None:
        cmd += ["--ablate", row.ablate]

    print(f"\n{'=' * 70}", flush=True)
    print(f"[E1v3-LoCoMo] {row.label} — start {datetime.now(timezone.utc).isoformat()}",
          flush=True)
    print(f"[E1v3-LoCoMo] cmd: {' '.join(cmd)}", flush=True)
    print(f"{'=' * 70}", flush=True)

    # Monotonic clock: the duration is immune to wall-clock adjustments
    # during a multi-hour run.
    t0 = time.monotonic()
    proc = subprocess.run(cmd, cwd=_ROOT)
    wall = time.monotonic() - t0

    rc = proc.returncode
    if rc != 0:
        print(f"[E1v3-LoCoMo][ERROR] {row.label} returncode={rc}",
              file=sys.stderr, flush=True)

    metrics = _read_row_metrics(out_path, row.label)

    print(f"[E1v3-LoCoMo] {row.label} done — rc={rc} wall={wall:.1f}s "
          f"mrr={metrics['mrr']} r10={metrics['r10']}", flush=True)
    return {
        "row": row.label,
        "anchor": row.anchor,
        "with_consolidation": row.with_consolidation,
        "ablate": row.ablate,
        "mrr": metrics["mrr"],
        "r10": metrics["r10"],
        "category_mrr": metrics["category_mrr"],
        "category_recall10": metrics["category_recall10"],
        "wall_seconds": wall,
        "returncode": rc,
    }
| 174 | + |
| 175 | + |
def _write_summary(rows: list[dict]) -> None:
    """Rewrite ``summary.csv`` in ``_OUT_DIR`` from the row records given.

    One CSV line is emitted per record. Records whose ``row`` equals their
    ``anchor`` are baselines and get empty delta columns; every other record
    gets ``anchor metric - row metric`` for MRR and R@10, left empty when
    either side is unavailable.

    Args:
        rows: row records as produced by ``_run_row``.
    """
    header = [
        "row", "anchor", "with_consolidation", "ablate",
        "mrr", "r10", "wall_seconds", "returncode",
        "delta_mrr_vs_anchor", "delta_r10_vs_anchor",
    ]

    # Baseline label → (mrr, r10) of that baseline row.
    baseline_scores: dict[str, tuple[float | None, float | None]] = {
        rec["row"]: (rec["mrr"], rec["r10"])
        for rec in rows
        if rec["row"] == rec["anchor"]
    }

    def _score(value):
        # Missing metrics are rendered as an empty cell.
        return "" if value is None else f"{value:.4f}"

    def _drop(baseline, ablated):
        # positive delta = ablation hurt the metric
        if baseline is None or ablated is None:
            return ""
        return f"{baseline - ablated:.4f}"

    def _lines():
        # Lazily formatted so each line is produced as it is written.
        for rec in rows:
            if rec["row"] == rec["anchor"]:
                mrr_delta = r10_delta = ""
            else:
                base_mrr, base_r10 = baseline_scores.get(
                    rec["anchor"], (None, None)
                )
                mrr_delta = _drop(base_mrr, rec["mrr"])
                r10_delta = _drop(base_r10, rec["r10"])
            yield [
                rec["row"],
                rec["anchor"],
                rec["with_consolidation"],
                rec["ablate"] or "",
                _score(rec["mrr"]),
                _score(rec["r10"]),
                f"{rec['wall_seconds']:.1f}",
                rec["returncode"],
                mrr_delta,
                r10_delta,
            ]

    destination = _OUT_DIR / "summary.csv"
    with destination.open("w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(_lines())
    print(f"[E1v3-LoCoMo] summary → {destination}", flush=True)
| 210 | + |
| 211 | + |
def main() -> int:
    """Run every row in ``ROWS`` serially and write the aggregate outputs.

    The first row (``ROWS[0]``, the no-consolidation baseline) acts as a
    sanity gate: if it fails, the run is aborted. ``manifest.json`` and
    ``summary.csv`` are rewritten after every completed row so that an
    interrupted run still leaves usable partial results.

    Returns:
        2 if the git tree is dirty (nothing is launched or written),
        1 if the baseline gate failed or any row exited non-zero,
        0 otherwise.
    """
    sha = _git_sha()
    dirty = _git_dirty()

    # Refuse before touching the output directory: a rejected launch must not
    # overwrite the manifest of an earlier, completed run.
    if dirty:
        print("[E1v3-LoCoMo][FATAL] tree is dirty; refusing to launch.",
              file=sys.stderr, flush=True)
        return 2

    _OUT_DIR.mkdir(parents=True, exist_ok=True)
    started_at = datetime.now(timezone.utc).isoformat()

    manifest = {
        "code_hash": sha,
        "dirty": dirty,
        "started_at": started_at,
        "n_rows": len(ROWS),
        "design": "two-baseline",
        "rows_spec": [
            {
                "label": r.label,
                "ablate": r.ablate,
                "with_consolidation": r.with_consolidation,
                "anchor": r.anchor,
            }
            for r in ROWS
        ],
        "rows": [],
    }
    manifest_path = _OUT_DIR / "manifest.json"
    rows: list[dict] = []

    def _checkpoint() -> None:
        # Persist the current state of the run (manifest + summary table).
        manifest["rows"] = rows
        manifest_path.write_text(json.dumps(manifest, indent=2, default=str))
        _write_summary(rows)

    # Record the run header before the first (long) harness invocation.
    manifest_path.write_text(json.dumps(manifest, indent=2, default=str))

    # 1. BASELINE_NO_CONSOLIDATION first — sanity gate vs CLAUDE.md (~0.794 MRR).
    rows.append(_run_row(ROWS[0]))
    if rows[0]["returncode"] != 0 or rows[0]["mrr"] is None:
        print("[E1v3-LoCoMo][FATAL] BASELINE_NO_CONSOLIDATION failed; aborting.",
              file=sys.stderr, flush=True)
        manifest["aborted"] = "baseline_no_consolidation_failed"
        _checkpoint()
        return 1

    # Checkpoint the baseline immediately so its result survives a crash
    # during the second row.
    manifest["last_completed_at"] = datetime.now(timezone.utc).isoformat()
    _checkpoint()

    # CLAUDE.md sanity tolerance: ±0.05 around 0.794. Soft warning if outside.
    expected_mrr = 0.794
    tolerance = 0.05
    bn_mrr = rows[0]["mrr"]
    if abs(bn_mrr - expected_mrr) > tolerance:
        print(
            f"[E1v3-LoCoMo][WARN] BASELINE_NO_CONSOLIDATION MRR={bn_mrr:.3f} "
            f"deviates >0.05 from CLAUDE.md headline 0.794. Continuing — "
            f"document in writeup.",
            file=sys.stderr, flush=True,
        )

    # 2. Remaining 13 rows.
    for row in ROWS[1:]:
        rows.append(_run_row(row))
        manifest["last_completed_at"] = datetime.now(timezone.utc).isoformat()
        _checkpoint()

    manifest["finished_at"] = datetime.now(timezone.utc).isoformat()
    _checkpoint()

    nonzero = [r for r in rows if r["returncode"] != 0]
    print(f"\n[E1v3-LoCoMo] complete. nonzero rows: {len(nonzero)}", flush=True)
    return 0 if not nonzero else 1
| 283 | + |
| 284 | + |
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
0 commit comments