|
1 | 1 | { |
2 | 2 | "version": "v0.1.5", |
3 | | - "captured_at": "2026-05-03T00:00:00Z", |
| 3 | + "captured_at": "2026-05-04T16:51:15Z", |
| 4 | + "capture_method": "config-emulation", |
| 5 | + "capture_notes": "Captured 2026-05-04 via config-emulation rather than v0.1.5 source checkout. Methodology: current source (v0.3.0a3) with retrieval.reranker=off, no classifier where-filter, no temporal filter, default tuned_hybrid backend. The existing supamem-supamem collection contains v0.3.0a3 payload metadata, but classifier and valid_to fields are unused at query time when the harness calls backend.query() with no where filter. This emulates pre-Phase-8 retrieval semantics. Source eval JSON: ~/.supamem/eval/v0.3.0a3-full-20260504T103341Z.json (470 records, heuristic judge). Phase 10 success criterion gated on -30% tokens_per_correct_answer reduction vs this baseline.", |
4 | 6 | "suite": "longmemeval_s", |
5 | | - "judge": {"kind": "heuristic", "model": "n/a"}, |
| 7 | + "judge": { |
| 8 | + "kind": "heuristic", |
| 9 | + "model": "n/a" |
| 10 | + }, |
| 11 | + "dataset": { |
| 12 | + "name": "longmemeval_s", |
| 13 | + "revision": "98d7416c24c778c2fee6e6f3006e7a073259d48f", |
| 14 | + "n": 470, |
| 15 | + "subset_ids": [] |
| 16 | + }, |
6 | 17 | "scores": { |
7 | | - "recall_at_5": 0.0, |
| 18 | + "recall_at_5": 0.21693683703050703, |
8 | 19 | "context_precision": null, |
9 | 20 | "context_recall": null, |
10 | 21 | "answer_relevance": null, |
11 | | - "tokens_per_correct_answer": 0.0, |
12 | | - "context_compression_ratio": 0.0, |
13 | | - "input_tokens_p50": 0.0, |
14 | | - "input_tokens_p95": 0.0, |
15 | | - "write_cost": 0.0 |
| 22 | + "tokens_per_correct_answer": 1374.5881560283688, |
| 23 | + "context_compression_ratio": 384.6544559994397, |
| 24 | + "input_tokens_p50": 742.0, |
| 25 | + "input_tokens_p95": 920.0, |
| 26 | + "write_cost": 746.9063829787234 |
16 | 27 | }, |
17 | | - "_baseline_pending": true, |
18 | | - "notes": "Pending real capture against the v0.1.5 release tag with heuristic judge. Plan 10-06 (release) replaces these placeholders with measured numbers before tagging. The runner's load_baseline tolerates this stub via the _baseline_pending flag and emits an err_console hint." |
| 28 | + "by_axis": { |
| 29 | + "single_session_user": { |
| 30 | + "recall_at_5": 0.1008008658008658, |
| 31 | + "context_precision": 0.0, |
| 32 | + "context_recall": 0.0, |
| 33 | + "answer_relevance": 0.0, |
| 34 | + "tokens_per_correct_answer": 841.2761904761905, |
| 35 | + "context_compression_ratio": 383.61438435374146, |
| 36 | + "input_tokens_p50": 707.9571428571429, |
| 37 | + "input_tokens_p95": 707.9571428571429, |
| 38 | + "write_cost": 712.7428571428571 |
| 39 | + }, |
| 40 | + "multi_session": { |
| 41 | + "recall_at_5": 0.33530110307941985, |
| 42 | + "context_precision": 0.0, |
| 43 | + "context_recall": 0.0, |
| 44 | + "answer_relevance": 0.0, |
| 45 | + "tokens_per_correct_answer": 1321.5147869674186, |
| 46 | + "context_compression_ratio": 563.5936404757483, |
| 47 | + "input_tokens_p50": 751.5488721804511, |
| 48 | + "input_tokens_p95": 751.5488721804511, |
| 49 | + "write_cost": 757.8195488721805 |
| 50 | + }, |
| 51 | + "temporal_reasoning": { |
| 52 | + "recall_at_5": 0.2174042228892513, |
| 53 | + "context_precision": 0.0, |
| 54 | + "context_recall": 0.0, |
| 55 | + "answer_relevance": 0.0, |
| 56 | + "tokens_per_correct_answer": 1931.9671679197995, |
| 57 | + "context_compression_ratio": 220.12139309953702, |
| 58 | + "input_tokens_p50": 737.4135338345865, |
| 59 | + "input_tokens_p95": 737.4135338345865, |
| 60 | + "write_cost": 748.1203007518797 |
| 61 | + }, |
| 62 | + "knowledge_update": { |
| 63 | + "recall_at_5": 0.2443019943019943, |
| 64 | + "context_precision": 0.0, |
| 65 | + "context_recall": 0.0, |
| 66 | + "answer_relevance": 0.0, |
| 67 | + "tokens_per_correct_answer": 1074.474358974359, |
| 68 | + "context_compression_ratio": 465.1351247517914, |
| 69 | + "input_tokens_p50": 725.1538461538462, |
| 70 | + "input_tokens_p95": 725.1538461538462, |
| 71 | + "write_cost": 730.0128205128206 |
| 72 | + }, |
| 73 | + "single_session_assistant": { |
| 74 | + "recall_at_5": 0.04176587301587302, |
| 75 | + "context_precision": 0.0, |
| 76 | + "context_recall": 0.0, |
| 77 | + "answer_relevance": 0.0, |
| 78 | + "tokens_per_correct_answer": 1261.517857142857, |
| 79 | + "context_compression_ratio": 239.64193247896506, |
| 80 | + "input_tokens_p50": 775.2857142857143, |
| 81 | + "input_tokens_p95": 775.2857142857143, |
| 82 | + "write_cost": 784.3392857142857 |
| 83 | + } |
| 84 | + } |
19 | 85 | } |
0 commit comments