synaptic-memory/eval/run_all.py at main · PlateerLab/synaptic-memory · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Unified QA benchmark runner — run after every development cycle.

Runs all evaluation datasets (custom + public) through the synaptic
pipeline and produces a regression-aware comparison table.

Usage::

    # Full run (all datasets)
    uv run python eval/run_all.py

    # Quick run (custom only, skip large public datasets)
    uv run python eval/run_all.py --quick

    # Compare against last baseline
    uv run python eval/run_all.py --compare eval/results/baseline.json

Output::

    ┌──────────────────┬────────┬───────┬───────┬───────┬──────────┐
    │ Dataset          │ Corpus │  MRR  │ P@10  │ R@10  │ Status   │
    ├──────────────────┼────────┼───────┼───────┼───────┼──────────┤
    │ KRRA Easy        │ 19,720 │ 0.967 │ 0.496 │ 0.914 │ ✅       │
    │ KRRA Hard        │ 19,720 │ 0.507 │ 0.157 │ 0.633 │ ✅       │
    │ assort Easy      │ 13,909 │ 0.880 │ 0.100 │ 0.933 │ ✅       │
    │ assort Hard      │ 13,909 │ 0.127 │ 0.047 │ 0.267 │ ✅       │
    │ HotPotQA-200     │  1,990 │ 0.742 │       │       │ NEW      │
    │ Ko-StrategyQA    │  9,251 │ 0.317 │       │       │ NEW      │
    │ ...              │        │       │       │       │          │
    └──────────────────┴────────┴───────┴───────┴───────┴──────────┘
"""

from __future__ import annotations

import argparse
import asyncio
import json
import os
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT))

from datetime import UTC

from synaptic.backends.memory import MemoryBackend
from synaptic.graph import SynapticGraph
from tests.benchmark.metrics import BenchmarkResult

# --- Dataset registry ---

# Filesystem roots used by the dataset registry, all derived from REPO_ROOT.
BENCHMARK_DIR = REPO_ROOT.joinpath("tests", "benchmark", "data")
EVAL_DIR = REPO_ROOT.joinpath("eval")
RESULTS_DIR = EVAL_DIR.joinpath("results")


@dataclass
class DatasetConfig:
    """Describes one evaluation dataset and where its files live.

    Custom datasets point ``path`` at a pre-built SQLite graph and
    ``query_path`` at a ground-truth JSON file; public datasets point
    ``path`` at a single benchmark JSON file.
    """

    name: str  # display label; copied into RunResult.name
    path: Path  # dataset file (SQLite graph or benchmark JSON)
    query_path: Path | None = None  # None = queries embedded in dataset
    # NOTE(review): the five *_key fields below are not read by the runners
    # visible in this part of the file (they hard-code "corpus", "queries",
    # "doc_id", "text", "title") — confirm where, if anywhere, they are used.
    corpus_key: str = "corpus"
    query_key: str = "queries"
    doc_id_key: str = "doc_id"
    text_key: str = "text"
    title_key: str = "title"
    k: int = 10  # ranking cutoff for metrics; runners over-fetch k * 2 and truncate to k
    is_custom: bool = False  # custom = KRRA/assort, not public
    quick: bool = True  # include in --quick mode


# Custom datasets (KRRA, assort, X2BEE): each entry pairs a pre-built SQLite
# graph with one ground-truth query file. Rows are (label, graph file, query file).
CUSTOM_DATASETS = [
    DatasetConfig(
        name=label,
        path=EVAL_DIR / "data" / graph_file,
        query_path=EVAL_DIR / "data" / "queries" / query_file,
        is_custom=True,
        quick=True,
    )
    for label, graph_file, query_file in (
        ("KRRA Easy", "krra_graph.sqlite", "krra.json"),
        ("KRRA Hard", "krra_graph.sqlite", "krra_hard.json"),
        ("assort Easy", "assort_graph.sqlite", "assort.json"),
        ("assort Hard", "assort_graph.sqlite", "assort_hard.json"),
        ("X2BEE Easy", "x2bee_graph.sqlite", "x2bee.json"),
        ("X2BEE Hard", "x2bee_graph.sqlite", "x2bee_hard.json"),
        ("KRRA Conv", "krra_graph.sqlite", "krra_conversational.json"),
        ("assort Conv", "assort_graph.sqlite", "assort_conversational.json"),
        ("X2BEE Conv", "x2bee_graph.sqlite", "x2bee_conversational.json"),
    )
]

# Public datasets (in-memory, from benchmark JSON).
# Rows are (label, file name under BENCHMARK_DIR, included in --quick mode).
PUBLIC_DATASETS = [
    DatasetConfig(name=label, path=BENCHMARK_DIR / file_name, quick=in_quick)
    for label, file_name, in_quick in (
        ("HotPotQA-24", "hotpotqa_24.json", True),
        ("HotPotQA-200", "hotpotqa.json", False),
        ("Allganize RAG-ko", "allganize_rag_ko.json", True),
        ("Allganize RAG-Eval", "allganize_rag_eval.json", True),
        ("PublicHealthQA", "publichealthqa_ko.json", True),
        ("AutoRAG", "autorag_retrieval.json", True),
        ("KLUE-MRC", "klue_mrc.json", False),
        ("Ko-StrategyQA", "ko_strategyqa.json", False),
    )
]


@dataclass
class RunResult:
    """Outcome of running one dataset: aggregate metrics, or an error message."""

    name: str  # dataset label (DatasetConfig.name)
    # Public runner: number of parsed corpus documents.
    # Custom runner: currently filled with the number of queries in the file.
    corpus_size: int = 0
    mrr: float = 0.0  # summary["mrr"]
    p_at_k: float = 0.0  # summary["mean_precision@k"]
    r_at_k: float = 0.0  # summary["mean_recall@k"]
    ndcg: float = 0.0  # summary["mean_ndcg@k"]
    hit_rate: str = ""  # "hits/total", where a hit is a query whose mrr > 0
    elapsed: float = 0.0  # wall-clock seconds spent in the search loop
    error: str | None = None  # short reason when the dataset could not be run


# --- Custom dataset runner (SQLite graph) ---


async def run_custom_dataset(
    cfg: DatasetConfig,
    embed_url: str | None = None,
    embed_model: str = "qwen3-embedding:4b",
    reranker_url: str | None = None,
    use_flashrank: bool = False,
) -> RunResult:
    """Run a custom dataset against its pre-built SQLite graph.

    Search always goes through EvidenceSearch. When ``embed_url`` is given an
    OpenAI-compatible embedder is attached; when ``reranker_url`` is given a
    TEI cross-encoder reranker is attached.

    Args:
        cfg: Dataset description; ``cfg.path`` is the SQLite graph and
            ``cfg.query_path`` the ground-truth JSON file.
        embed_url: Base URL of the embedding service, or None for no embedder.
        embed_model: Embedding model name passed to the embedder.
        reranker_url: Base URL of the TEI reranker, or None for no reranker.
        use_flashrank: Accepted for interface compatibility; not used here.

    Returns:
        A RunResult with aggregate metrics, or one whose ``error`` is set when
        the graph or the query file is missing.
    """
    if not cfg.path.exists():
        return RunResult(name=cfg.name, error="graph not found")
    if not cfg.query_path or not cfg.query_path.exists():
        return RunResult(name=cfg.name, error="queries not found")

    # Load ground truth first so a malformed query file never leaves an
    # open backend connection behind.
    with open(cfg.query_path, encoding="utf-8") as f:
        gt = json.load(f)
    queries = gt.get("queries", [])
    id_field = gt.get("id_field", "doc_id")
    match_on_title = id_field == "node_title"

    from synaptic.backends.sqlite_graph import SqliteGraphBackend

    backend = SqliteGraphBackend(str(cfg.path))
    await backend.connect()

    bench = BenchmarkResult()
    try:
        # Build searcher — with optional embedding + reranker
        embedder = None
        if embed_url:
            from synaptic.extensions.embedder import OpenAIEmbeddingProvider

            embedder = OpenAIEmbeddingProvider(api_base=embed_url, model=embed_model)

        reranker = None
        # FlashRank is English-only (ms-marco trained). For Korean datasets
        # use TEI with bge-reranker-v2-m3 instead.
        if reranker_url:
            from synaptic.extensions.reranker_cross import TEIReranker

            reranker = TEIReranker(base_url=reranker_url)

        from synaptic.extensions.evidence_search import EvidenceSearch

        searcher = EvidenceSearch(backend=backend, embedder=embedder, reranker=reranker)

        t0 = time.time()

        for q in queries:
            qid = q.get("qid", "")
            query_text = q.get("query", "")
            relevant = set(q.get("relevant_docs", []))
            if not relevant:
                # Nothing to score against; skip without searching.
                continue

            # Over-fetch (k * 2) because several evidence items can collapse
            # onto the same identifier once de-duplicated.
            result = await searcher.search(query_text, k=cfg.k * 2, fts_seed_limit=30)

            if match_on_title:
                candidates = (ev.node.title for ev in result.evidence)
            else:
                candidates = (
                    ev.document_id or (ev.node.properties or {}).get("doc_id", "")
                    for ev in result.evidence
                )
            # Order-preserving de-duplication that also drops empty identifiers.
            retrieved = list(dict.fromkeys(c for c in candidates if c))

            bench.add(
                query_id=qid,
                query=query_text,
                retrieved=retrieved[: cfg.k],
                relevant=relevant,
                k=cfg.k,
            )

        elapsed = time.time() - t0
    finally:
        # Release the SQLite connection even if search or setup failed.
        await backend.close()

    summary = bench.summary()
    total = len(queries)
    hits = sum(1 for q in bench.queries if q.get("mrr", 0) > 0)

    return RunResult(
        name=cfg.name,
        # NOTE(review): this is the number of queries in the file, not the
        # graph's node count — confirm whether that is intended.
        corpus_size=total,
        mrr=summary.get("mrr", 0),
        p_at_k=summary.get("mean_precision@k", 0),
        r_at_k=summary.get("mean_recall@k", 0),
        ndcg=summary.get("mean_ndcg@k", 0),
        hit_rate=f"{hits}/{total}",
        elapsed=elapsed,
    )


# --- Public dataset runner (in-memory) ---


def _public_corpus_rows(raw_corpus: object) -> list[tuple[str, str, str]]:
    """Normalize a dict- or list-form corpus into (doc_id, title, text) rows."""
    rows: list[tuple[str, str, str]] = []
    if isinstance(raw_corpus, dict):
        for doc_id, doc in raw_corpus.items():
            if isinstance(doc, dict):
                rows.append((str(doc_id), str(doc.get("title", "")), str(doc.get("text", ""))))
            elif isinstance(doc, str):
                rows.append((str(doc_id), "", doc))
    elif isinstance(raw_corpus, list):
        for doc in raw_corpus:
            if not isinstance(doc, dict):
                continue
            doc_id = str(doc.get("doc_id", doc.get("_id", doc.get("id", ""))))
            rows.append(
                (
                    doc_id,
                    str(doc.get("title", "")),
                    str(doc.get("text", doc.get("content", ""))),
                )
            )
    return rows


def _public_relevant_ids(rel: object) -> set[str] | None:
    """Turn a qrels entry (dict keyed by doc id, or list of ids) into a set of str ids."""
    if isinstance(rel, dict):
        return {str(key) for key in rel}
    if isinstance(rel, list):
        return {str(item) for item in rel}
    return None


def _public_query_rows(queries: object, qrels: object) -> list[tuple[str, str, set[str]]]:
    """Parse list-form or BEIR dict-form queries into (qid, text, relevant_ids) rows."""
    rows: list[tuple[str, str, set[str]]] = []
    if isinstance(queries, dict):
        # BEIR format: queries={qid: text}, relevant_docs={qid: {doc_id: score}}
        qrels_map = qrels if isinstance(qrels, dict) else {}
        for qid, text in queries.items():
            relevant = _public_relevant_ids(qrels_map.get(qid, {}))
            if relevant and text:
                rows.append((str(qid), str(text), relevant))
    elif isinstance(queries, list):
        for q in queries:
            if not isinstance(q, dict):
                # Mirror the corpus parser: silently skip malformed entries.
                continue
            qid = str(q.get("qid", q.get("query_id", q.get("_id", ""))))
            text = str(q.get("query", q.get("question", "")))
            relevant = _public_relevant_ids(
                q.get("relevant_docs", q.get("answer_ids", q.get("positive_doc_ids", [])))
            )
            if relevant and text:
                rows.append((qid, text, relevant))
    return rows


async def run_public_dataset(
    cfg: DatasetConfig,
    embed_url: str | None = None,
    embed_model: str = "qwen3-embedding:4b",
    reranker_url: str | None = None,
) -> RunResult:
    """Run a public benchmark dataset — full pipeline: ingest → index → search.

    The corpus is ingested into a MemoryBackend through ``graph.add()``.
    Search goes through EvidenceSearch when an embedder or reranker is
    configured, otherwise through ``graph.search``.

    Args:
        cfg: Dataset description; ``cfg.path`` is the benchmark JSON file.
        embed_url: Base URL of the embedding service, or None for no embedder.
        embed_model: Embedding model name passed to the embedder.
        reranker_url: Base URL of the TEI reranker, or None for no reranker.

    Returns:
        A RunResult with aggregate metrics, or one whose ``error`` is set when
        the file is missing, empty, or cannot be parsed into a corpus/queries.
    """
    if not cfg.path.exists():
        return RunResult(name=cfg.name, error="file not found")

    with open(cfg.path, encoding="utf-8") as f:
        data = json.load(f)

    raw_corpus = data.get("corpus", data.get("documents", []))
    queries = data.get("queries", [])
    if not raw_corpus or not queries:
        return RunResult(name=cfg.name, error="empty dataset")

    # Normalize corpus to list of (doc_id, title, text)
    corpus = _public_corpus_rows(raw_corpus)
    if not corpus:
        return RunResult(name=cfg.name, error="could not parse corpus")

    # Parse queries before ingesting: a dataset without usable queries should
    # fail fast instead of paying for a full corpus ingest first.
    qrels = data.get("relevant_docs", data.get("qrels", {}))
    query_list = _public_query_rows(queries, qrels)
    if not query_list:
        return RunResult(name=cfg.name, error="no valid queries")

    # Full pipeline: build graph via graph.add()
    # NOTE(review): unlike the SQLite runner, this backend is never closed —
    # confirm whether MemoryBackend needs an explicit close().
    backend = MemoryBackend()
    await backend.connect()
    graph = SynapticGraph(backend)

    for doc_id, title, text in corpus:
        if not text and not title:
            continue
        await graph.add(
            title=title or doc_id,
            content=text,
            properties={"doc_id": doc_id},
        )

    # Build searcher — EvidenceSearch when embedder/reranker available, else graph.search
    embedder = None
    if embed_url:
        from synaptic.extensions.embedder import OpenAIEmbeddingProvider

        embedder = OpenAIEmbeddingProvider(api_base=embed_url, model=embed_model)

    reranker = None
    if reranker_url:
        from synaptic.extensions.reranker_cross import TEIReranker

        reranker = TEIReranker(base_url=reranker_url)

    searcher = None
    if embedder is not None or reranker is not None:
        from synaptic.extensions.evidence_search import EvidenceSearch

        searcher = EvidenceSearch(backend=backend, embedder=embedder, reranker=reranker)

    # Search
    bench = BenchmarkResult()
    t0 = time.time()

    for qid, query_text, relevant in query_list:
        # Over-fetch (k * 2): several hits can map to the same doc_id.
        if searcher:
            result = await searcher.search(query_text, k=cfg.k * 2, fts_seed_limit=30)
            candidates = (
                ev.document_id or (ev.node.properties or {}).get("doc_id", "")
                for ev in result.evidence
            )
        else:
            result = await graph.search(query_text, limit=cfg.k * 2)
            candidates = ((hit.node.properties or {}).get("doc_id", "") for hit in result.nodes)
        # Order-preserving de-duplication that also drops empty identifiers.
        retrieved = list(dict.fromkeys(c for c in candidates if c))

        bench.add(
            query_id=qid,
            query=query_text,
            retrieved=retrieved[: cfg.k],
            relevant=relevant,
            k=cfg.k,
        )

    elapsed = time.time() - t0

    summary = bench.summary()
    total_q = summary.get("total_queries", 0)
    hits = sum(1 for q in bench.queries if q.get("mrr", 0) > 0)

    return RunResult(
        name=cfg.name,
        corpus_size=len(corpus),
        mrr=summary.get("mrr", 0),
        p_at_k=summary.get("mean_precision@k", 0),
        r_at_k=summary.get("mean_recall@k", 0),
        ndcg=summary.get("mean_ndcg@k", 0),
        hit_rate=f"{hits}/{total_q}",
        elapsed=elapsed,
    )


# --- Multi-turn Agent Benchmark ---

# System prompt text for the research agent. The tool names it mentions
# (deep_search, filter_nodes, aggregate_nodes, join_related, search) match the
# schemas declared in AGENT_TOOLS. The Korean example questions are part of the
# prompt itself (runtime text), not comments — edit them only deliberately.
AGENT_SYSTEM = """\
You are a research agent. Use the provided tools to answer the question.

## Tool selection (pick the RIGHT one first time)
- Text question → deep_search(query, category="relevant category from metadata")
- Price/date/attribute filter → filter_nodes(table, property, op, value)
- "how many per X" / TOP N → aggregate_nodes(table, group_by, metric)
- "find related records" → join_related(from_value, fk_property, target_table)
- Find by name/text → filter_nodes(table, property=name_column, op="contains", value="keyword")

## Key rules
- Use the exact table and column names from the structured data metadata below
- ALWAYS use category filter when you can identify the topic from metadata
- You can call MULTIPLE tools in ONE turn for efficiency
- Max 15 tool calls total. Be efficient.
- Respond in the same language as the question.

## Fallback when search returns 0 results
1. Try filter_nodes with op="contains" on text columns (e.g., product_name, goods_nm)
2. Try search with shorter/individual keywords from your query
3. Try search with translated terms (Korean ↔ English)

## Structured data patterns
- Node titles = table_name:pk_value (e.g., "products:12800000", "colors:1")
- Use FK relationships from metadata to chain queries across tables
- For cross-table questions: find source → join_related → target table

## Examples
Q: "말 복지 향상 프로그램"
→ deep_search(query="말 복지", category="복지 및 교육")

Q: "50만원 이상 고가 상품"
→ filter_nodes(table="pr_goods_base", property="sales_prc", op=">=", value="500000")

Q: "가장 많이 팔린 상품"
→ aggregate_nodes(table="pr_goods_sold_hist", group_by="goods_no", metric="sum")

Q: "5점 리뷰가 가장 많은 상품"
→ aggregate_nodes(table="feedback", group_by="goods_no", metric="count", where_property="score", where_op="==", where_value="5")

Q: "스마트폰 제품 찾기"
→ filter_nodes(table="pr_goods_base", property="goods_nm", op="contains", value="phone")

## Date queries — use starts_with or date_range or group_by_format
Q: "2023년 12월 판매 건수"
→ filter_nodes(table="sold_hist", property="sold_dtm", op="starts_with", value="2023-12")

Q: "2023년 여름(6-8월) 판매"
→ filter_nodes(table="sold_hist", property="sold_dtm", op="date_range", value="2023-06-01..2023-08-31")

Q: "월별 매출 추이"
→ aggregate_nodes(table="sold_hist", group_by="sold_dtm", group_by_format="YYYY-MM", metric="count")

## Multi-hop chaining — pass previous step's node_titles or group values as from_ids
Q: "판매량 1위 상품의 리뷰 평점 평균"
Step 1: aggregate_nodes(table="sold_hist", group_by="goods_no", metric="sum", metric_property="sold_qunt")
  → top groups include {"group": "G00001", "node_title": "pr_goods_base:G00001"}
Step 2: aggregate_nodes(table="feedback", group_by="score", metric="count",
                         where_property="goods_no", where_op="==", where_value="G00001")

Q: "5점 리뷰 최다 상품 중 가장 저렴한 것"
Step 1: aggregate_nodes(table="feedback", group_by="goods_no", metric="count",
                         where_property="score", where_op="==", where_value="5")
  → groups=[{node_title:"pr_goods_base:G00857"}, ...]
Step 2: filter_nodes(from_ids=["pr_goods_base:G00857","pr_goods_base:G00472"],
                      property="sales_prc", op=">=", value="0")
  → then pick the cheapest from results

Q: "iPhone과 Galaxy Book의 판매 이력"
→ join_related(from_values=["G00007","G00003"], fk_property="goods_no", target_table="pr_goods_sold_hist")

## Language fallback
- If data contains English product names, try English keywords when Korean search returns 0
- Example: "치즈" returns 0 → try "cheese" instead
"""

def _function_tool(name: str, description: str, properties: dict, required: list) -> dict:
    """Wrap a parameter spec in the function-tool envelope shared by every entry."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool schemas offered to the agent, in presentation order. Each entry is a
# function-tool definition whose parameters are a JSON-schema object.
AGENT_TOOLS = [
    _function_tool(
        "deep_search",
        "Search + expand + read in ONE call.",
        {
            "query": {"type": "string"},
            "category": {"type": "string"},
        },
        ["query"],
    ),
    _function_tool(
        "search",
        "Basic text search.",
        {
            "query": {"type": "string"},
        },
        ["query"],
    ),
    _function_tool(
        "filter_nodes",
        "Filter by property. Returns {total, showing, results}. Supports multi-hop chaining via from_ids.",
        {
            "table": {
                "type": "string",
                "description": "Table name from metadata e.g. pr_goods_base",
            },
            "property": {"type": "string", "description": "Column name e.g. sales_prc"},
            "op": {
                "type": "string",
                "description": ">=, <=, >, <, ==, !=, contains, starts_with, date_range",
            },
            "value": {
                "type": "string",
                "description": "Value. For date_range: '2023-06-01..2023-08-31'. For starts_with: prefix like '2023-12'",
            },
            "limit": {
                "type": "integer",
                "description": "Max results to return (default 20). Use higher for listings.",
            },
            "from_ids": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Optional: restrict to these node titles/IDs (multi-hop chaining from previous step's results)",
            },
        },
        ["property", "op", "value"],
    ),
    _function_tool(
        "aggregate_nodes",
        "GROUP BY + COUNT/SUM/AVG/MAX/MIN with WHERE pre-filter, date bucketing, and multi-hop chaining.",
        {
            "table": {"type": "string", "description": "Table name from metadata"},
            "group_by": {"type": "string", "description": "Column to group by"},
            "metric": {"type": "string", "enum": ["count", "sum", "avg", "max", "min"]},
            "metric_property": {
                "type": "string",
                "description": "Numeric column for sum/avg/max/min",
            },
            "where_property": {
                "type": "string",
                "description": "Pre-filter column e.g. score",
            },
            "where_op": {
                "type": "string",
                "description": "==, !=, >=, <=, >, <, contains, starts_with, date_range",
            },
            "where_value": {"type": "string", "description": "Pre-filter value e.g. 5"},
            "group_by_format": {
                "type": "string",
                "description": "Date bucket format: 'YYYY', 'YYYY-MM', 'YYYY-MM-DD'. Use for monthly/yearly aggregation on datetime columns.",
            },
            "limit": {"type": "integer", "description": "Max groups (default 50)"},
            "from_ids": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Optional: restrict aggregation to these node titles/IDs (multi-hop chaining)",
            },
        },
        ["group_by"],
    ),
    _function_tool(
        "join_related",
        "FK lookup — find related records. Accepts single from_value OR list of from_values for batch JOIN.",
        {
            "from_value": {"type": "string", "description": "Single FK value e.g. G00001"},
            "from_values": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Multiple FK values for batch IN-clause JOIN (multi-hop chaining)",
            },
            "fk_property": {"type": "string", "description": "FK column e.g. goods_no"},
            "target_table": {
                "type": "string",
                "description": "Target table e.g. pr_goods_sold_hist",
            },
            "limit": {"type": "integer", "description": "Max results (default 20)"},
        },
        ["fk_property", "target_table"],
    ),
    _function_tool(
        "get_document",
        "Read a full document.",
        {
            "doc_id": {"type": "string"},
            "query": {"type": "string"},
        },
        ["doc_id"],
    ),
    _function_tool(
        "expand",
        "Explore graph neighbours of a node — follow edges to discover related nodes (FK-linked rows, document chunks, category siblings).",
        {
            "node_id": {"type": "string", "description": "Node ID to expand from"},
        },
        ["node_id"],
    ),
    _function_tool(
        "follow",
        "Follow a specific edge type from a node. Edge types: contains, part_of, next_chunk, related, mentions.",
        {
            "node_id": {"type": "string", "description": "Source node ID"},
            "edge_kind": {
                "type": "string",
                "description": "Edge type to follow: related, contains, part_of, etc.",
            },
        },
        ["node_id", "edge_kind"],
    ),
]


def _extract_ids(data: dict, found_ids: set[str], known_tables: set[str] | None = None) -> None:
    """Extract ALL possible document identifiers from any tool result.

    Covers every tool's response structure:
    - evidence[].document_id, evidence[].properties.doc_id, evidence[].title
    - results[].properties.doc_id, results[].title
    - merged_evidence[].document_id
    - document_excerpts[].document.properties.doc_id
    - sub_results[].top_result.document_id
    - document.properties.doc_id (get_document)
    - chunks[].properties (get_document)
    - groups[].group (aggregate — group value may be a PK like goods_no)

    Args:
        known_tables: Set of actual table names from the graph (e.g. {"colors", "products"}).
            Used to resolve FK column stems to real table names for aggregate groups.
    """
    # Flat item lists
    for key in (
        "evidence",
        "results",
        "merged_evidence",
        "matches",
        "expanded_neighbours",
        "neighbours",
    ):
        for item in data.get(key, []):
            # Direct document_id field (from EvidenceAggregator)
            did = item.get("document_id", "")
            if did:
                found_ids.add(did)
            # properties.doc_id
            props = item.get("properties", {})
            did2 = props.get("doc_id", "")
            if did2:
                found_ids.add(did2)
            # title (for assort: "products:12800000")
            title = item.get("title", "")
            if title:
                found_ids.add(title)

    # document_excerpts (from deep_search)
    for excerpt in data.get("document_excerpts", []):
        doc = excerpt.get("document", {})
        did = doc.get("properties", {}).get("doc_id", "")
        if did:
            found_ids.add(did)
        title = doc.get("title", "")
        if title:
            found_ids.add(title)

    # sub_results (from compare_search)
    for sub in data.get("sub_results", []):
        top = sub.get("top_result")
        if isinstance(top, dict):
            did = top.get("document_id", "")
            if did:
                found_ids.add(did)
            props = top.get("properties", {})
            did2 = props.get("doc_id", "")
            if did2:
                found_ids.add(did2)

    # get_document response
    doc_data = data.get("document", {})
    if isinstance(doc_data, dict):
        did = doc_data.get("properties", {}).get("doc_id", "")
        if did:
            found_ids.add(did)

    # filter_nodes / join_related results already covered by "results" above

    # aggregate groups — group value may be a PK (e.g. goods_no "G00001")
    agg_info = data.get("aggregation", {})
    agg_table = agg_info.get("table", "")
    group_by = agg_info.get("group_by", "")
    for grp in data.get("groups", []):
        g = grp.get("group", "")
        if not g:
            continue
        # Add raw value
        found_ids.add(g)
        # If aggregate tool provided node_title (Phase 2B), use it directly
        nt = grp.get("node_title", "")
        if nt:
            found_ids.add(nt)

        # Heuristic prefix generation is only useful when the group value
        # looks like a primary key (short, identifier-like). Skip for
        # dates, long strings, spaces, or non-PK-looking values to avoid
        # flooding found_ids with noise like "pr_sold_base:2023-12-20...".
        looks_like_pk = (
            g
            and len(g) <= 30
            and " " not in g
            and "-" not in g[:5]  # not a date prefix
            and not g.startswith("20")  # reject common year-start strings
        )
        if not looks_like_pk:
            continue

        # Source table prefix
        if agg_table:
            found_ids.add(f"{agg_table}:{g}")

        # Resolve FK column → target table name
        if group_by:
            # Strip trailing _no/_id/_code to get stem
            base = group_by.rsplit("_", 1)[0] if "_" in group_by else group_by

            # Method 1: Match against known table names (most reliable)
            if known_tables:
                for tbl in known_tables:
                    # "color" matches "colors", "goods" matches "pr_goods_base"
                    tbl_lower = tbl.lower()
                    if base in tbl_lower or tbl_lower.startswith(base):
                        found_ids.add(f"{tbl}:{g}")

            # Method 2: Heuristic fallbacks (singular, plural, prefixed)
            for candidate in (
                f"{base}:{g}",
                f"{base}s:{g}",  # plural: color → colors
                f"{base}es:{g}",  # plural: address → addresses
                f"pr_{base}_base:{g}",  # Korean DB: goods → pr_goods_base
                f"pr_{base}:{g}",
            ):
                found_ids.add(candidate)


async def _agent_dispatch(name, args, backend, session, *, embedder=None):
    """Route agent tool calls to synaptic tools."""
    from synaptic.agent_tools import (
        expand_tool,
        follow_tool,
        get_document_tool,
        search_tool,
    )
    from synaptic.agent_tools_structured import (
        aggregate_nodes_tool,
        filter_nodes_tool,
        join_related_tool,
    )
    from synaptic.agent_tools_v2 import deep_search_tool

    if name == "deep_search":
        r = await deep_search_tool(
            backend,
            session,
            args.get("query", ""),
            category=args.get("category"),
            embedder=embedder,
        )
    elif name == "search":
        r = await search_tool(backend, session, args.get("query", ""), embedder=embedder)
    elif name == "expand":
        r = await expand_tool(backend, session, args.get("node_id", ""))
    elif name == "follow":
        r = await follow_tool(
            backend, session, args.get("node_id", ""), args.get("edge_kind", "related")
        )
    elif name == "filter_nodes":
        r = await filter_nodes_tool(
            backend,
            session,
            table=args.get("table", ""),
            property=args.get("property", ""),
            op=args.get("op", "contains"),
            value=args.get("value", ""),
            limit=int(args.get("limit", 20)),
            from_ids=args.get("from_ids") or None,
        )
    elif name == "aggregate_nodes":
        r = await aggregate_nodes_tool(
            backend,
            session,
            table=args.get("table", ""),
            group_by=args.get("group_by", ""),
            metric=args.get("metric", "count"),
            metric_property=args.get("metric_property", ""),
            where_property=args.get("where_property", ""),
            where_op=args.get("where_op", ""),
            where_value=args.get("where_value", ""),
            group_by_format=args.get("group_by_format", ""),
            limit=int(args.get("limit", 50)),
            from_ids=args.get("from_ids") or None,
        )
    elif name == "join_related":
        r = await join_related_tool(
            backend,
            session,
            from_value=args.get("from_value", ""),
            from_values=args.get("from_values") or None,
            fk_property=args.get("fk_property", ""),
            target_table=args.get("target_table", ""),
            limit=int(args.get("limit", 20)),
        )
    elif name == "get_document":
        r = await get_document_tool(backend, session, args["doc_id"], query=args.get("query", ""))
    else:
        return {"error": f"unknown: {name}"}
    return r.to_dict()


async def _llm_judge(
    client: Any,
    query: str,
    agent_answer: str,
    relevant_samples: list[str],
) -> bool:
    """Ask an LLM whether the agent's answer semantically satisfies the query.

    Compares the agent's final text answer against ground-truth sample node
    titles. Returns True if the answer is a plausible response to the query,
    even if it doesn't match the exact GT IDs.
    """
    if not agent_answer.strip():
        return False
    prompt = f"""Judge whether the agent answer correctly addresses the query.

Query: {query}

Expected answer domain (sample relevant items — just examples, not exhaustive):
{", ".join(relevant_samples[:5])}

Agent answer:
{agent_answer[:1500]}

Rules:
- Answer YES if the response is a reasonable, factually plausible answer.
- Answer NO only if the agent completely failed or gave a clearly wrong response.
- Do NOT require exact ID matches — the samples are just examples, many
  other valid items may exist.
- For counting / listing queries: YES if the count or category is correct.
- For filter / search queries: YES if the returned items satisfy the criteria.
- For document queries: YES if the answer discusses the right topic area
  (even if specific document IDs differ from samples).
- For recommendation queries: YES if any reasonable recommendation is given.
- For multi-hop queries: YES if the final answer is correct, regardless of
  intermediate IDs.

Reply with only YES or NO."""
    try:
        resp = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=10,
            temperature=0,
        )
        return "YES" in (resp.choices[0].message.content or "").upper()
    except Exception:
        return False


async def run_agent_benchmark(
    cfg: DatasetConfig,
    api_key: str,
    model: str = "gpt-4o-mini",
    max_turns: int = 3,
    embed_url: str | None = None,
    embed_model: str = "qwen3-embedding:4b",
    judge: bool = False,
    llm_base_url: str | None = None,
) -> RunResult:
    """Run multi-turn agent on a custom dataset's hard queries."""
    if not cfg.query_path or not cfg.query_path.exists():
        return RunResult(name=cfg.name + " (agent)", error="queries not found")
    if not cfg.path.exists():
        return RunResult(name=cfg.name + " (agent)", error="graph not found")

    import os

    os.environ["OPENAI_API_KEY"] = api_key or "ollama"

    from openai import AsyncOpenAI

    from synaptic.backends.sqlite_graph import SqliteGraphBackend
    from synaptic.search_session import SearchSession, build_graph_context

    client = AsyncOpenAI(base_url=llm_base_url) if llm_base_url else AsyncOpenAI()
    backend = SqliteGraphBackend(str(cfg.path))
    await backend.connect()

    # Build embedder for agent search tools (same as run_custom_dataset)
    embedder = None
    if embed_url:
        from synaptic.extensions.embedder import OpenAIEmbeddingProvider

        embedder = OpenAIEmbeddingProvider(api_base=embed_url, model=embed_model)

    graph_ctx = await build_graph_context(backend)
    system = AGENT_SYSTEM + "\n\n" + graph_ctx

    # Collect known table names for _extract_ids matching
    from synaptic.models import NodeKind as _NK

    _sample = await backend.list_nodes(kind=_NK.ENTITY, limit=50_000)
    known_tables: set[str] = set()
    for _n in _sample:
        _tbl = (_n.properties or {}).get("_table_name")
        if _tbl:
            known_tables.add(_tbl)

    with open(cfg.query_path, encoding="utf-8") as f:
        gt = json.load(f)
    queries = gt.get("queries", [])
    id_field = gt.get("id_field", "doc_id")

    solved = 0