Skip to content

Commit bd5bab5

Browse files
committed
docs(eval): 7-model ontology benchmark — local DiffusionGemma+ontology beats frontier bare
672 isolated runs across 7 models (Opus/Sonnet/Haiku/Gemini Flash/Flash Lite/Z.AI/DiffusionGemma). DiffusionGemma+ontology F1 0.505 exceeds every frontier model bare (best: Gemini Flash 0.423). Full academic report with BibTeX citations, detailed caveats, DreamLab branding. LinkedIn-ready Beamer deck (9 slides). Deterministic grader + raw prediction data included. Co-Authored-By: jjohare <github@thedreamlab.uk>
1 parent b65942a commit bd5bab5

18 files changed

Lines changed: 3243 additions & 240 deletions

docs/eval/all-preds.jsonl

Lines changed: 672 additions & 0 deletions
Large diffs are not rendered by default.

docs/eval/deck.pdf

211 KB
Binary file not shown.

docs/eval/deck.tex

Lines changed: 196 additions & 142 deletions
Large diffs are not rendered by default.

docs/eval/grade.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
#!/usr/bin/env python3
2+
"""Deterministic KG-as-oracle grader (PRD-020 ontology-augment eval).
3+
4+
Rebuilt from REPORT_DATA.md spec. Grades a model's predicted localnames for each
5+
gold question via token-set greedy 1:1 matching (singularised), then aggregates
6+
recall@K / precision / F1 / hallucination by model, by question-type, and overall
7+
— matching the summary-mm.json shape so the LaTeX report consumes it unchanged.
8+
9+
Matching rule (per question):
10+
- normalise each term: lowercase, split on non-alnum, drop a trailing 's'
11+
(crude singularisation, applied only to tokens longer than 3 characters), as a token SET.
12+
- a predicted term MATCHES a gold term iff their token sets overlap on the
13+
shorter set's length (subset-or-equal) OR share >= 2 tokens. Greedy 1:1:
14+
each gold can be claimed once, each prediction claims at most one gold.
15+
- recall@K: cap predictions to K (12 for neighbour/subclass) before matching;
16+
existence uses any-match recall (did the model assert the right localname).
17+
- precision = matched / predictions_considered ; F1 = harmonic mean ;
18+
hallucination = unmatched_predictions / predictions_considered.
19+
20+
Input JSONL (one row per cell): {question_id, model, arm, rep, predictions:[...]}
21+
arm ∈ {aug, ctl}. gold.json supplies the gold per question_id (index or seed).
22+
23+
Usage: grade.py <gold.json> <predictions.jsonl> [out_summary.json] [out_rows.csv]
24+
"""
25+
import json
26+
import sys
27+
import csv
28+
import re
29+
from collections import defaultdict
30+
31+
# K for recall@K: grade_cell keeps only the first RECALL_K predictions of a
# "neighbour" or "subclass" question before matching them against gold.
RECALL_K = 12
32+
33+
34+
def toks(term):
    """Normalise *term* into a frozenset of lowercase alphanumeric tokens.

    The term is stringified, lowercased and split on every run of characters
    outside ``[a-z0-9]``. Tokens longer than three characters that end in
    ``"s"`` lose that final letter (crude singularisation); shorter tokens
    are kept untouched. Empty fragments produced by the split are discarded.
    """
    normalised = set()
    for piece in re.split(r"[^a-z0-9]+", str(term).lower()):
        if not piece:
            continue
        if len(piece) > 3 and piece[-1] == "s":
            piece = piece[:-1]
        normalised.add(piece)
    return frozenset(normalised)
37+
38+
39+
def matches(pred_t, gold_t):
    """Decide whether a predicted token set matches a gold token set.

    Two non-empty token sets match when the smaller one is fully contained
    in the other (the overlap equals the smaller set's size) or when they
    share at least two tokens. An empty set on either side never matches.
    """
    if pred_t and gold_t:
        shared = len(pred_t & gold_t)
        if shared:
            smaller = min(len(pred_t), len(gold_t))
            return shared >= 2 or shared >= smaller
    return False
47+
48+
49+
def _as_term_list(terms):
    """Coerce a predictions/gold field into a list of terms.

    ``None`` (e.g. a JSON ``null``) becomes an empty list and a bare string is
    treated as ONE term rather than being iterated character by character.
    Any other iterable is materialised so it can be sliced and measured.
    """
    if terms is None:
        return []
    if isinstance(terms, str):
        return [terms]
    return list(terms)


def grade_cell(predictions, gold, qtype):
    """Grade one cell (one model answer to one gold question).

    Args:
        predictions: iterable of predicted terms, a single string, or None.
        gold: iterable of gold terms, a single string, or None.
        qtype: question type. "neighbour" and "subclass" cap the predictions
            at RECALL_K before matching; "existence" uses any-match recall.

    Returns:
        dict with float values under the keys "recall", "precision", "f1"
        and "halluc".
    """
    preds = _as_term_list(predictions)
    golds = _as_term_list(gold)
    if qtype in ("neighbour", "subclass"):
        preds = preds[:RECALL_K]

    pred_t = [toks(p) for p in preds]
    gold_t = [toks(g) for g in golds]

    # Greedy 1:1 assignment: each prediction claims the first still-unclaimed
    # gold term it matches; a claimed gold term cannot be matched again.
    claimed = set()
    matched = 0
    for pt in pred_t:
        for gi, gt in enumerate(gold_t):
            if gi in claimed:
                continue
            if matches(pt, gt):
                claimed.add(gi)
                matched += 1
                break

    # Denominators are floored at 1 so empty inputs score 0 instead of raising.
    n_pred = max(1, len(preds))
    n_gold = max(1, len(golds))
    if qtype == "existence":
        recall = 1.0 if matched >= 1 else 0.0
    else:
        recall = matched / n_gold
    precision = matched / n_pred
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
    halluc = (len(preds) - matched) / n_pred
    return {"recall": recall, "precision": precision, "f1": f1, "halluc": halluc}
75+
76+
77+
_USAGE = "Usage: grade.py <gold.json> <predictions.jsonl> [out_summary.json] [out_rows.csv]\n"
_METRICS = ("f1", "recall", "precision", "halluc")


def _load_gold_items(gold_path):
    """Read the gold file and return its list of question items.

    Accepts either a top-level JSON list or an object holding the list under
    "questions" or "items". Raises ValueError when no such list is present.
    """
    with open(gold_path, encoding="utf-8") as fh:
        goldj = json.load(fh)
    if isinstance(goldj, list):
        return goldj
    items = None
    if isinstance(goldj, dict):
        items = goldj.get("questions") or goldj.get("items")
    if not isinstance(items, list):
        raise ValueError(
            f"{gold_path}: expected a JSON list or an object with a "
            "'questions' or 'items' list"
        )
    return items


def _index_gold(gold_items):
    """Map every known identifier of a gold item to that item.

    Each item is reachable by its "seed", "label" and "concept" values (as
    strings), by its position as a string, and by its position as an int.
    Later items overwrite earlier ones on a key collision.
    """
    gold_by_id = {}
    for i, it in enumerate(gold_items):
        for key in (it.get("seed"), it.get("label"), it.get("concept"), str(i)):
            if key:
                gold_by_id[str(key)] = it
        gold_by_id[i] = it  # numeric index too
    return gold_by_id


def _lookup_gold(gold_by_id, qid):
    """Return the gold item for *qid*, or None when the id is unknown.

    The raw id is tried first, then its string form. Integer ids go through
    the same dict lookup, so a negative or out-of-range index is reported as
    unknown instead of wrapping around to another item or raising.
    """
    try:
        it = gold_by_id.get(qid)
    except TypeError:  # unhashable id (list/dict) cannot be a dict key
        it = None
    if it is None:
        it = gold_by_id.get(str(qid))
    return it


def _mean_metrics(rows, keep):
    """Average each metric over the rows accepted by *keep*; None if no rows."""
    sel = [r for r in rows if keep(r)]
    if not sel:
        return None
    n = len(sel)
    return {k: round(sum(r[k] for r in sel) / n, 4) for k in _METRICS}


def _compare_arms(rows, keep, with_halluc):
    """Summarise the aug vs ctl arms over the rows accepted by *keep*.

    "aug"/"ctl" are None when that arm has no rows. "delta" treats a missing
    arm as 0 so it is always a number, as in the existing summary shape.
    """
    aug = _mean_metrics(rows, lambda r: keep(r) and r["arm"] == "aug") or {}
    ctl = _mean_metrics(rows, lambda r: keep(r) and r["arm"] == "ctl") or {}
    out = {
        "aug": aug.get("f1"),
        "ctl": ctl.get("f1"),
        "delta": round(aug.get("f1", 0) - ctl.get("f1", 0), 4),
    }
    if with_halluc:
        out["aug_halluc"] = aug.get("halluc")
        out["ctl_halluc"] = ctl.get("halluc")
    return out


def main():
    """Grade a predictions JSONL file against a gold file and write a summary.

    Reads the paths from sys.argv: gold JSON, predictions JSONL, then an
    optional summary path (standard output when omitted) and an optional
    per-row CSV path. Exits with status 2 and a usage message when the two
    required paths are missing. Prediction rows whose question_id is not in
    the gold file are skipped and counted in a warning on stderr.
    """
    if len(sys.argv) < 3:
        sys.stderr.write(_USAGE)
        sys.exit(2)
    gold_path, pred_path = sys.argv[1], sys.argv[2]
    out_summary = sys.argv[3] if len(sys.argv) > 3 else None
    out_rows = sys.argv[4] if len(sys.argv) > 4 else None

    gold_by_id = _index_gold(_load_gold_items(gold_path))

    rows = []
    skipped = 0
    with open(pred_path, encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            r = json.loads(line)
            qid = r.get("question_id")
            it = _lookup_gold(gold_by_id, qid)
            if it is None:
                skipped += 1
                continue
            qtype = it.get("type", "neighbour")
            # "or []" turns a JSON null into an empty list before grading.
            g = grade_cell(r.get("predictions") or [], it.get("gold") or [], qtype)
            rows.append({"model": r.get("model", "?"), "arm": r.get("arm", "?"),
                         "type": qtype, "qid": str(qid), **g})

    models = sorted({r["model"] for r in rows})
    types = sorted({r["type"] for r in rows})
    summary = {"by_model": {}, "by_type": {}, "overall": {}}
    for m in models:
        summary["by_model"][m] = _compare_arms(rows, lambda r, m=m: r["model"] == m, True)
    for t in types:
        summary["by_type"][t] = _compare_arms(rows, lambda r, t=t: r["type"] == t, False)
    summary["overall"] = _compare_arms(rows, lambda r: True, True)
    summary["overall"]["n_rows"] = len(rows)

    if out_summary is None:
        json.dump(summary, sys.stdout, indent=2)
        sys.stdout.flush()
    else:
        with open(out_summary, "w", encoding="utf-8") as fh:
            json.dump(summary, fh, indent=2)
    if out_rows:
        with open(out_rows, "w", newline="", encoding="utf-8") as fh:
            w = csv.DictWriter(fh, fieldnames=["model", "arm", "type", "qid", "f1", "recall", "precision", "halluc"])
            w.writeheader()
            w.writerows(rows)
    if skipped:
        sys.stderr.write(f"warning: skipped {skipped} prediction rows with unknown question_id\n")
    sys.stderr.write(f"graded {len(rows)} cells; {len(models)} models; overall aug={summary['overall']['aug']} ctl={summary['overall']['ctl']} Δ={summary['overall']['delta']}\n")
144+
145+
146+
# Script entry point: run the grader only when executed directly, not on import.
if __name__ == "__main__":
    sys.exit(main())

docs/eval/grounding.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)