Skip to content

Commit ab0a0d5

Browse files
committed
eval: capture v0.1.5 baseline via config-emulation
Replaces the placeholder v0.1.5.json (all-zeros, _baseline_pending=true) with measured numbers so the bench harness can compute real baseline_delta values. Methodology — config-emulation rather than v0.1.5 source checkout: the runner calls TunedHybridBackend.query() with no where-filter, so the Phase 7 classifier, Phase 9 valid_to, and Phase 11 filtered_dense levers don't fire at query time regardless of the indexer-side payload schema. Running the current source with retrieval.reranker=off therefore reproduces v0.1.5 retrieval semantics on the existing collection, and checking out the v0.1.5 source is unnecessary. Source measurement: ~/.supamem/eval/v0.3.0a3-full-20260504T103341Z.json (470 records, heuristic judge, full LongMemEval_S). The capture_method and capture_notes fields document this in the baseline JSON itself so future delta claims can cite the methodology. Headline finding (now visible in baseline_delta): comparing the current default v0.3.0a3 stack (rerank-on) against this baseline, tokens_per_correct_answer goes 1374.59 -> 1510.34 (a +9.9% regression). The milestone gate (-30% reduction) is currently failing by a wide margin; Phase 13 is formally blocked per the ROADMAP gating rule.
1 parent 18772e5 commit ab0a0d5

1 file changed

Lines changed: 76 additions & 10 deletions

File tree

Lines changed: 76 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,85 @@
11
{
22
"version": "v0.1.5",
3-
"captured_at": "2026-05-03T00:00:00Z",
3+
"captured_at": "2026-05-04T16:51:15Z",
4+
"capture_method": "config-emulation",
5+
"capture_notes": "Captured 2026-05-04 via config-emulation rather than v0.1.5 source checkout. Methodology: current source (v0.3.0a3) with retrieval.reranker=off, no classifier where-filter, no temporal filter, default tuned_hybrid backend. The existing supamem-supamem collection contains v0.3.0a3 payload metadata, but classifier and valid_to fields are unused at query time when the harness calls backend.query() with no where filter. This emulates pre-Phase-8 retrieval semantics. Source eval JSON: ~/.supamem/eval/v0.3.0a3-full-20260504T103341Z.json (470 records, heuristic judge). Phase 10 success criterion gated on -30% tokens_per_correct_answer reduction vs this baseline.",
46
"suite": "longmemeval_s",
5-
"judge": {"kind": "heuristic", "model": "n/a"},
7+
"judge": {
8+
"kind": "heuristic",
9+
"model": "n/a"
10+
},
11+
"dataset": {
12+
"name": "longmemeval_s",
13+
"revision": "98d7416c24c778c2fee6e6f3006e7a073259d48f",
14+
"n": 470,
15+
"subset_ids": []
16+
},
617
"scores": {
7-
"recall_at_5": 0.0,
18+
"recall_at_5": 0.21693683703050703,
819
"context_precision": null,
920
"context_recall": null,
1021
"answer_relevance": null,
11-
"tokens_per_correct_answer": 0.0,
12-
"context_compression_ratio": 0.0,
13-
"input_tokens_p50": 0.0,
14-
"input_tokens_p95": 0.0,
15-
"write_cost": 0.0
22+
"tokens_per_correct_answer": 1374.5881560283688,
23+
"context_compression_ratio": 384.6544559994397,
24+
"input_tokens_p50": 742.0,
25+
"input_tokens_p95": 920.0,
26+
"write_cost": 746.9063829787234
1627
},
17-
"_baseline_pending": true,
18-
"notes": "Pending real capture against the v0.1.5 release tag with heuristic judge. Plan 10-06 (release) replaces these placeholders with measured numbers before tagging. The runner's load_baseline tolerates this stub via the _baseline_pending flag and emits an err_console hint."
28+
"by_axis": {
29+
"single_session_user": {
30+
"recall_at_5": 0.1008008658008658,
31+
"context_precision": 0.0,
32+
"context_recall": 0.0,
33+
"answer_relevance": 0.0,
34+
"tokens_per_correct_answer": 841.2761904761905,
35+
"context_compression_ratio": 383.61438435374146,
36+
"input_tokens_p50": 707.9571428571429,
37+
"input_tokens_p95": 707.9571428571429,
38+
"write_cost": 712.7428571428571
39+
},
40+
"multi_session": {
41+
"recall_at_5": 0.33530110307941985,
42+
"context_precision": 0.0,
43+
"context_recall": 0.0,
44+
"answer_relevance": 0.0,
45+
"tokens_per_correct_answer": 1321.5147869674186,
46+
"context_compression_ratio": 563.5936404757483,
47+
"input_tokens_p50": 751.5488721804511,
48+
"input_tokens_p95": 751.5488721804511,
49+
"write_cost": 757.8195488721805
50+
},
51+
"temporal_reasoning": {
52+
"recall_at_5": 0.2174042228892513,
53+
"context_precision": 0.0,
54+
"context_recall": 0.0,
55+
"answer_relevance": 0.0,
56+
"tokens_per_correct_answer": 1931.9671679197995,
57+
"context_compression_ratio": 220.12139309953702,
58+
"input_tokens_p50": 737.4135338345865,
59+
"input_tokens_p95": 737.4135338345865,
60+
"write_cost": 748.1203007518797
61+
},
62+
"knowledge_update": {
63+
"recall_at_5": 0.2443019943019943,
64+
"context_precision": 0.0,
65+
"context_recall": 0.0,
66+
"answer_relevance": 0.0,
67+
"tokens_per_correct_answer": 1074.474358974359,
68+
"context_compression_ratio": 465.1351247517914,
69+
"input_tokens_p50": 725.1538461538462,
70+
"input_tokens_p95": 725.1538461538462,
71+
"write_cost": 730.0128205128206
72+
},
73+
"single_session_assistant": {
74+
"recall_at_5": 0.04176587301587302,
75+
"context_precision": 0.0,
76+
"context_recall": 0.0,
77+
"answer_relevance": 0.0,
78+
"tokens_per_correct_answer": 1261.517857142857,
79+
"context_compression_ratio": 239.64193247896506,
80+
"input_tokens_p50": 775.2857142857143,
81+
"input_tokens_p95": 775.2857142857143,
82+
"write_cost": 784.3392857142857
83+
}
84+
}
1985
}

0 commit comments

Comments
 (0)