Skip to content

Commit b68c5ac

Browse files
cdeust and claude committed
feat(verif): --ablate + --with-consolidation flags for LoCoMo harness
Mirror the LME-S harness signature (commit 0e1f90d) onto LoCoMo:

- --ablate <MECH> exports CORTEX_ABLATE_<MECH>=1 before benchmark loader
- --with-consolidation invokes consolidate() once per conversation, after db.load_memories() and before the QA loop. Mirrors LME-S's per-question insertion at the conversation grain (LoCoMo conversations are self-contained multi-session units; db.clear() between conversations precludes cross-conversation state).
- --results-out writes manifest block (with_consolidation, ablate_mechanism, ablate_env_var, n_conversations, n_questions, consolidation_call_count, consolidation_total_wall_s).

Smoke (n=1 conversation):

- no consolidation: MRR=0.866, R@10=99.0%, wall=222s
- with consolidation: MRR=0.222, R@10=54.8%, wall=176s + 128s consolidation

Smoke surfaced an architectural collision documented in tasks/e1-v3-locomo-smoke-finding.md: LoCoMo timestamps from 2023 collide with wall-clock-relative consolidation gates (compression at age >7d). The 14-row sweep (next commit) routes mechanisms to two distinct baselines.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c5ade6b commit b68c5ac

1 file changed

Lines changed: 167 additions & 3 deletions

File tree

benchmarks/locomo/run_benchmark.py

Lines changed: 167 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,16 @@
55
66
Run:
77
python3 benchmarks/locomo/run_benchmark.py [--limit N] [--verbose]
8+
[--with-consolidation]
9+
[--ablate MECH]
10+
[--results-out PATH]
811
"""
912

1013
from __future__ import annotations
1114

1215
import argparse
16+
import asyncio
17+
import json
1318
import os
1419
import sys
1520
import time
@@ -128,10 +133,42 @@ def print_results(
128133
print(f"Conversations: {n_convs}, Questions: {overall_total}")
129134

130135

136+
# ── Consolidation pass ────────────────────────────────────────────────────
137+
138+
139+
def _run_consolidation_pass() -> float:
    """Run the production consolidate handler a single time and time it.

    Expected state on entry: the corpus has already been loaded into PG
    through BenchmarkDB, and any CORTEX_ABLATE_* environment variables have
    already been exported so the ablation guard can fire inside the handler's
    stages.

    In the LoCoMo harness this runs once per conversation: after every
    session of that conversation has been loaded and before its QA
    evaluation starts, so consolidation operates on the just-loaded session
    set. This is the LME-S "after load, before recall" pattern applied at the
    conversation grain, because LoCoMo scores a conversation's QA against its
    own multi-session haystack rather than across conversations.

    Returns:
        Elapsed wall time of the handler call, in seconds.
    """
    # Function-local import: the handler module is only loaded when a
    # consolidation pass is actually requested.
    from mcp_server.handlers import consolidate as consolidate_handler

    # Monotonic clock, so the measurement is unaffected by system clock changes.
    started = time.monotonic()
    # The handler is a coroutine function; drive it to completion with an
    # empty argument dict. Its return value is not used here.
    asyncio.run(consolidate_handler.handler({}))
    finished = time.monotonic()
    return finished - started
159+
160+
131161
# ── Main ─────────────────────────────────────────────────────────────────
132162

133163

134-
def run_benchmark(data_path: str, limit: int | None = None, verbose: bool = False):
164+
def run_benchmark(
165+
data_path: str,
166+
limit: int | None = None,
167+
verbose: bool = False,
168+
*,
169+
with_consolidation: bool = False,
170+
ablate_mechanism: str | None = None,
171+
) -> dict:
135172
data = load_locomo(data_path)
136173
if limit:
137174
data = data[:limit]
@@ -140,9 +177,18 @@ def run_benchmark(data_path: str, limit: int | None = None, verbose: bool = Fals
140177
f"Running benchmark on {len(data)} conversations, "
141178
f"{sum(len(c['qa']) for c in data)} QA pairs (PostgreSQL backend)..."
142179
)
180+
if with_consolidation:
181+
print(
182+
" consolidation: ON (between session-load and QA, per conversation)"
183+
)
184+
if ablate_mechanism:
185+
print(f" ablation: CORTEX_ABLATE_{ablate_mechanism}=1")
186+
print()
143187

144188
all_results: dict[str, list[dict]] = defaultdict(list)
145189
total_start = time.time()
190+
consolidation_total_wall_s = 0.0
191+
consolidation_call_count = 0
146192

147193
with BenchmarkDB() as db:
148194
for conv_idx, conv in enumerate(data):
@@ -162,6 +208,17 @@ def run_benchmark(data_path: str, limit: int | None = None, verbose: bool = Fals
162208
]
163209
mem_ids, source_map = db.load_memories(memories, domain="locomo")
164210

211+
# Consolidation pass between session-load and QA. Off by default to
212+
# preserve historical reproducibility. ON exercises the
213+
# consolidation-only mechanisms (CASCADE, INTERFERENCE,
214+
# HOMEOSTATIC_PLASTICITY, SYNAPTIC_PLASTICITY, MICROGLIAL_PRUNING,
215+
# TWO_STAGE_MODEL, EMOTIONAL_DECAY, TRIPARTITE_SYNAPSE,
216+
# SCHEMA_ENGINE) so per-mechanism ablation deltas become
217+
# attributable on the longitudinal benchmark.
218+
if with_consolidation:
219+
consolidation_total_wall_s += _run_consolidation_pass()
220+
consolidation_call_count += 1
221+
165222
conv_results = evaluate_conversation(
166223
db, sessions, mem_ids, source_map, conv["qa"]
167224
)
@@ -174,21 +231,115 @@ def run_benchmark(data_path: str, limit: int | None = None, verbose: bool = Fals
174231
f"({time.time() - total_start:.1f}s)"
175232
)
176233

177-
print_results(all_results, time.time() - total_start, len(data))
234+
elapsed = time.time() - total_start
235+
print_results(all_results, elapsed, len(data))
178236

179237
if verbose:
180238
print("\nMissed questions (no hit in top 10):")
181239
for cat, rs in all_results.items():
182240
for m in [r for r in rs if not r["hit_rank"] or r["hit_rank"] > 10][:3]:
183241
print(f" [{cat}] {m['question'][:80]}")
184242

243+
# Aggregate metrics for results-out / driver consumption.
244+
overall_mrr_sum = 0.0
245+
overall_r10 = 0
246+
overall_total = 0
247+
category_mrr: dict[str, float] = {}
248+
category_recall10: dict[str, float] = {}
249+
for cat, rs in all_results.items():
250+
if not rs:
251+
continue
252+
mrr_sum = sum(1.0 / r["hit_rank"] for r in rs if r["hit_rank"])
253+
r10 = sum(1 for r in rs if r["hit_rank"] and r["hit_rank"] <= 10)
254+
n = len(rs)
255+
category_mrr[cat] = mrr_sum / n
256+
category_recall10[cat] = r10 / n
257+
overall_mrr_sum += mrr_sum
258+
overall_r10 += r10
259+
overall_total += n
260+
261+
overall_mrr = overall_mrr_sum / overall_total if overall_total else 0.0
262+
overall_recall10 = overall_r10 / overall_total if overall_total else 0.0
263+
264+
if with_consolidation:
265+
avg_ms = (
266+
consolidation_total_wall_s / consolidation_call_count * 1000
267+
if consolidation_call_count
268+
else 0.0
269+
)
270+
print(
271+
f"Consolidation: {consolidation_call_count} calls, "
272+
f"total {consolidation_total_wall_s:.1f}s "
273+
f"(avg {avg_ms:.1f}ms/call) — excluded from per-question stats"
274+
)
275+
276+
manifest = {
277+
"with_consolidation": with_consolidation,
278+
"ablate_mechanism": ablate_mechanism,
279+
"ablate_env_var": (
280+
f"CORTEX_ABLATE_{ablate_mechanism}=1" if ablate_mechanism else None
281+
),
282+
"n_conversations": len(data),
283+
"n_questions": overall_total,
284+
"consolidation_call_count": consolidation_call_count,
285+
"consolidation_total_wall_s": consolidation_total_wall_s,
286+
}
287+
288+
return {
289+
"overall_mrr": overall_mrr,
290+
"overall_recall10": overall_recall10,
291+
"category_mrr": category_mrr,
292+
"category_recall10": category_recall10,
293+
"elapsed_s": elapsed,
294+
"consolidation_total_wall_s": consolidation_total_wall_s,
295+
"consolidation_call_count": consolidation_call_count,
296+
"manifest": manifest,
297+
}
298+
185299

186300
if __name__ == "__main__":
187301
parser = argparse.ArgumentParser(description="LoCoMo benchmark for Cortex")
188302
parser.add_argument("--limit", type=int, help="Limit conversations")
189303
parser.add_argument("--verbose", action="store_true", help="Show misses")
304+
parser.add_argument(
305+
"--with-consolidation",
306+
action="store_true",
307+
help=(
308+
"After loading each conversation's sessions and BEFORE QA, invoke "
309+
"the production consolidate handler so consolidation-only "
310+
"mechanisms (CASCADE, INTERFERENCE, HOMEOSTATIC_PLASTICITY, "
311+
"SYNAPTIC_PLASTICITY, MICROGLIAL_PRUNING, TWO_STAGE_MODEL, "
312+
"EMOTIONAL_DECAY, TRIPARTITE_SYNAPSE, SCHEMA_ENGINE) are "
313+
"exercised. Required for honest per-mechanism ablation."
314+
),
315+
)
316+
parser.add_argument(
317+
"--ablate",
318+
type=str,
319+
default=None,
320+
metavar="MECH",
321+
help=(
322+
"Set CORTEX_ABLATE_<MECH>=1 BEFORE consolidation and recall. "
323+
"MECH is the Mechanism enum NAME (e.g. CASCADE, RECONSOLIDATION, "
324+
"CO_ACTIVATION, ADAPTIVE_DECAY)."
325+
),
326+
)
327+
parser.add_argument(
328+
"--results-out",
329+
type=str,
330+
default=None,
331+
help="Optional path to write the result+manifest JSON.",
332+
)
190333
args = parser.parse_args()
191334

335+
# Export ablation env var BEFORE any handler/store import touches it. The
336+
# ablation.is_disabled reads os.environ on every call, so setting it here
337+
# is sufficient as long as we do it before run_benchmark.
338+
ablate_mech: str | None = None
339+
if args.ablate:
340+
ablate_mech = args.ablate.strip().upper()
341+
os.environ[f"CORTEX_ABLATE_{ablate_mech}"] = "1"
342+
192343
data_dir = Path(__file__).parent
193344
data_path = data_dir / "locomo10.json"
194345
if not data_path.exists():
@@ -199,4 +350,17 @@ def run_benchmark(data_path: str, limit: int | None = None, verbose: bool = Fals
199350
)
200351
sys.exit(1)
201352

202-
run_benchmark(str(data_path), limit=args.limit, verbose=args.verbose)
353+
results = run_benchmark(
354+
str(data_path),
355+
limit=args.limit,
356+
verbose=args.verbose,
357+
with_consolidation=args.with_consolidation,
358+
ablate_mechanism=ablate_mech,
359+
)
360+
361+
if args.results_out:
362+
out_path = Path(args.results_out)
363+
out_path.parent.mkdir(parents=True, exist_ok=True)
364+
with open(out_path, "w") as f:
365+
json.dump(results, f, indent=2, default=str)
366+
print(f"Results written to {out_path}")

0 commit comments

Comments
 (0)