55
66Run:
77 python3 benchmarks/locomo/run_benchmark.py [--limit N] [--verbose]
8+ [--with-consolidation]
9+ [--ablate MECH]
10+ [--results-out PATH]
811"""
912
1013from __future__ import annotations
1114
1215import argparse
16+ import asyncio
17+ import json
1318import os
1419import sys
1520import time
@@ -128,10 +133,42 @@ def print_results(
128133 print (f"Conversations: { n_convs } , Questions: { overall_total } " )
129134
130135
136+ # ── Consolidation pass ────────────────────────────────────────────────────
137+
138+
def _run_consolidation_pass() -> float:
    """Run the production consolidate handler one time and time it.

    The handler is called with an empty argument dict and driven to
    completion with ``asyncio.run``; whatever it returns is discarded.

    Caller expectations (not checked here):
      * the corpus has already been loaded into PG through BenchmarkDB;
      * any CORTEX_ABLATE_* environment variables are already exported, so
        the ablation guard fires inside the handler's stages.

    In the LoCoMo driver this runs once per conversation: after that
    conversation's sessions are loaded and before its QA is evaluated. That
    mirrors the LME-S "after load, before recall" pattern at conversation
    grain, because LoCoMo scores each conversation's QA against its own
    multi-session haystack rather than across conversations.

    Returns:
        Elapsed wall-clock seconds for the handler call, measured with the
        monotonic clock.
    """
    # Imported lazily so the handler package is only loaded when a
    # consolidation pass is actually requested.
    from mcp_server.handlers import consolidate as consolidate_handler

    started_at = time.monotonic()
    # asyncio.run creates a fresh event loop for this call; it must not be
    # invoked from inside an already-running loop.
    asyncio.run(consolidate_handler.handler({}))
    finished_at = time.monotonic()
    return finished_at - started_at
159+
160+
131161# ── Main ─────────────────────────────────────────────────────────────────
132162
133163
134- def run_benchmark (data_path : str , limit : int | None = None , verbose : bool = False ):
164+ def run_benchmark (
165+ data_path : str ,
166+ limit : int | None = None ,
167+ verbose : bool = False ,
168+ * ,
169+ with_consolidation : bool = False ,
170+ ablate_mechanism : str | None = None ,
171+ ) -> dict :
135172 data = load_locomo (data_path )
136173 if limit :
137174 data = data [:limit ]
@@ -140,9 +177,18 @@ def run_benchmark(data_path: str, limit: int | None = None, verbose: bool = Fals
140177 f"Running benchmark on { len (data )} conversations, "
141178 f"{ sum (len (c ['qa' ]) for c in data )} QA pairs (PostgreSQL backend)..."
142179 )
180+ if with_consolidation :
181+ print (
182+ " consolidation: ON (between session-load and QA, per conversation)"
183+ )
184+ if ablate_mechanism :
185+ print (f" ablation: CORTEX_ABLATE_{ ablate_mechanism } =1" )
186+ print ()
143187
144188 all_results : dict [str , list [dict ]] = defaultdict (list )
145189 total_start = time .time ()
190+ consolidation_total_wall_s = 0.0
191+ consolidation_call_count = 0
146192
147193 with BenchmarkDB () as db :
148194 for conv_idx , conv in enumerate (data ):
@@ -162,6 +208,17 @@ def run_benchmark(data_path: str, limit: int | None = None, verbose: bool = Fals
162208 ]
163209 mem_ids , source_map = db .load_memories (memories , domain = "locomo" )
164210
211+ # Consolidation pass between session-load and QA. Off by default to
212+ # preserve historical reproducibility. ON exercises the
213+ # consolidation-only mechanisms (CASCADE, INTERFERENCE,
214+ # HOMEOSTATIC_PLASTICITY, SYNAPTIC_PLASTICITY, MICROGLIAL_PRUNING,
215+ # TWO_STAGE_MODEL, EMOTIONAL_DECAY, TRIPARTITE_SYNAPSE,
216+ # SCHEMA_ENGINE) so per-mechanism ablation deltas become
217+ # attributable on the longitudinal benchmark.
218+ if with_consolidation :
219+ consolidation_total_wall_s += _run_consolidation_pass ()
220+ consolidation_call_count += 1
221+
165222 conv_results = evaluate_conversation (
166223 db , sessions , mem_ids , source_map , conv ["qa" ]
167224 )
@@ -174,21 +231,115 @@ def run_benchmark(data_path: str, limit: int | None = None, verbose: bool = Fals
174231 f"({ time .time () - total_start :.1f} s)"
175232 )
176233
177- print_results (all_results , time .time () - total_start , len (data ))
234+ elapsed = time .time () - total_start
235+ print_results (all_results , elapsed , len (data ))
178236
179237 if verbose :
180238 print ("\n Missed questions (no hit in top 10):" )
181239 for cat , rs in all_results .items ():
182240 for m in [r for r in rs if not r ["hit_rank" ] or r ["hit_rank" ] > 10 ][:3 ]:
183241 print (f" [{ cat } ] { m ['question' ][:80 ]} " )
184242
243+ # Aggregate metrics for results-out / driver consumption.
244+ overall_mrr_sum = 0.0
245+ overall_r10 = 0
246+ overall_total = 0
247+ category_mrr : dict [str , float ] = {}
248+ category_recall10 : dict [str , float ] = {}
249+ for cat , rs in all_results .items ():
250+ if not rs :
251+ continue
252+ mrr_sum = sum (1.0 / r ["hit_rank" ] for r in rs if r ["hit_rank" ])
253+ r10 = sum (1 for r in rs if r ["hit_rank" ] and r ["hit_rank" ] <= 10 )
254+ n = len (rs )
255+ category_mrr [cat ] = mrr_sum / n
256+ category_recall10 [cat ] = r10 / n
257+ overall_mrr_sum += mrr_sum
258+ overall_r10 += r10
259+ overall_total += n
260+
261+ overall_mrr = overall_mrr_sum / overall_total if overall_total else 0.0
262+ overall_recall10 = overall_r10 / overall_total if overall_total else 0.0
263+
264+ if with_consolidation :
265+ avg_ms = (
266+ consolidation_total_wall_s / consolidation_call_count * 1000
267+ if consolidation_call_count
268+ else 0.0
269+ )
270+ print (
271+ f"Consolidation: { consolidation_call_count } calls, "
272+ f"total { consolidation_total_wall_s :.1f} s "
273+ f"(avg { avg_ms :.1f} ms/call) — excluded from per-question stats"
274+ )
275+
276+ manifest = {
277+ "with_consolidation" : with_consolidation ,
278+ "ablate_mechanism" : ablate_mechanism ,
279+ "ablate_env_var" : (
280+ f"CORTEX_ABLATE_{ ablate_mechanism } =1" if ablate_mechanism else None
281+ ),
282+ "n_conversations" : len (data ),
283+ "n_questions" : overall_total ,
284+ "consolidation_call_count" : consolidation_call_count ,
285+ "consolidation_total_wall_s" : consolidation_total_wall_s ,
286+ }
287+
288+ return {
289+ "overall_mrr" : overall_mrr ,
290+ "overall_recall10" : overall_recall10 ,
291+ "category_mrr" : category_mrr ,
292+ "category_recall10" : category_recall10 ,
293+ "elapsed_s" : elapsed ,
294+ "consolidation_total_wall_s" : consolidation_total_wall_s ,
295+ "consolidation_call_count" : consolidation_call_count ,
296+ "manifest" : manifest ,
297+ }
298+
185299
186300if __name__ == "__main__" :
187301 parser = argparse .ArgumentParser (description = "LoCoMo benchmark for Cortex" )
188302 parser .add_argument ("--limit" , type = int , help = "Limit conversations" )
189303 parser .add_argument ("--verbose" , action = "store_true" , help = "Show misses" )
304+ parser .add_argument (
305+ "--with-consolidation" ,
306+ action = "store_true" ,
307+ help = (
308+ "After loading each conversation's sessions and BEFORE QA, invoke "
309+ "the production consolidate handler so consolidation-only "
310+ "mechanisms (CASCADE, INTERFERENCE, HOMEOSTATIC_PLASTICITY, "
311+ "SYNAPTIC_PLASTICITY, MICROGLIAL_PRUNING, TWO_STAGE_MODEL, "
312+ "EMOTIONAL_DECAY, TRIPARTITE_SYNAPSE, SCHEMA_ENGINE) are "
313+ "exercised. Required for honest per-mechanism ablation."
314+ ),
315+ )
316+ parser .add_argument (
317+ "--ablate" ,
318+ type = str ,
319+ default = None ,
320+ metavar = "MECH" ,
321+ help = (
322+ "Set CORTEX_ABLATE_<MECH>=1 BEFORE consolidation and recall. "
323+ "MECH is the Mechanism enum NAME (e.g. CASCADE, RECONSOLIDATION, "
324+ "CO_ACTIVATION, ADAPTIVE_DECAY)."
325+ ),
326+ )
327+ parser .add_argument (
328+ "--results-out" ,
329+ type = str ,
330+ default = None ,
331+ help = "Optional path to write the result+manifest JSON." ,
332+ )
190333 args = parser .parse_args ()
191334
    # Export the ablation env var BEFORE any handler/store import touches it.
    # ablation.is_disabled reads os.environ on every call, so setting it here
    # is sufficient as long as it happens before run_benchmark is called.
338+ ablate_mech : str | None = None
339+ if args .ablate :
340+ ablate_mech = args .ablate .strip ().upper ()
341+ os .environ [f"CORTEX_ABLATE_{ ablate_mech } " ] = "1"
342+
192343 data_dir = Path (__file__ ).parent
193344 data_path = data_dir / "locomo10.json"
194345 if not data_path .exists ():
@@ -199,4 +350,17 @@ def run_benchmark(data_path: str, limit: int | None = None, verbose: bool = Fals
199350 )
200351 sys .exit (1 )
201352
202- run_benchmark (str (data_path ), limit = args .limit , verbose = args .verbose )
353+ results = run_benchmark (
354+ str (data_path ),
355+ limit = args .limit ,
356+ verbose = args .verbose ,
357+ with_consolidation = args .with_consolidation ,
358+ ablate_mechanism = ablate_mech ,
359+ )
360+
361+ if args .results_out :
362+ out_path = Path (args .results_out )
363+ out_path .parent .mkdir (parents = True , exist_ok = True )
364+ with open (out_path , "w" ) as f :
365+ json .dump (results , f , indent = 2 , default = str )
366+ print (f"Results written to { out_path } " )
0 commit comments