|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Deterministic KG-as-oracle grader (PRD-020 ontology-augment eval). |
| 3 | +
|
| 4 | +Rebuilt from REPORT_DATA.md spec. Grades a model's predicted localnames for each |
| 5 | +gold question via token-set greedy 1:1 matching (singularised), then aggregates |
| 6 | +recall@K / precision / F1 / hallucination by model, by question-type, and overall |
| 7 | +— matching the summary-mm.json shape so the LaTeX report consumes it unchanged. |
| 8 | +
|
| 9 | +Matching rule (per question): |
| 10 | + - normalise each term: lowercase, split on non-alnum, drop a trailing 's' |
| 11 | + from tokens longer than 3 characters (crude singularisation), as a token SET. |
| 12 | + - a predicted term MATCHES a gold term iff their token sets overlap on the |
| 13 | + shorter set's length (subset-or-equal) OR share >= 2 tokens. Greedy 1:1: |
| 14 | + each gold can be claimed once, each prediction claims at most one gold. |
| 15 | + - recall@K: cap predictions to K (12 for neighbour/subclass) before matching; |
| 16 | + existence uses any-match recall (did the model assert the right localname). |
| 17 | + - precision = matched / predictions_considered ; F1 = harmonic mean ; |
| 18 | + hallucination = unmatched_predictions / predictions_considered. |
| 19 | +
|
| 20 | +Input JSONL (one row per cell): {question_id, model, arm, rep, predictions:[...]} |
| 21 | + arm ∈ {aug, ctl}. gold.json supplies the gold per question_id (list index, or the item's seed / label / concept). |
| 22 | +
|
| 23 | +Usage: grade.py <gold.json> <predictions.jsonl> [out_summary.json] [out_rows.csv] |
| 24 | +""" |
| 25 | +import json |
| 26 | +import sys |
| 27 | +import csv |
| 28 | +import re |
| 29 | +from collections import defaultdict |
| 30 | + |
| 31 | +RECALL_K = 12 |
| 32 | + |
| 33 | + |
def toks(term):
    """Normalise *term* into a frozenset of lowercase alphanumeric tokens.

    The term is stringified, lower-cased and split on every run of characters
    outside ``[a-z0-9]``.  Tokens longer than three characters that end in
    ``s`` lose that final ``s`` (crude singularisation); shorter tokens are
    kept untouched.  Order and duplicates are discarded.
    """
    tokens = set()
    for piece in re.split(r"[^a-z0-9]+", str(term).lower()):
        if not piece:
            # re.split yields empty strings at leading/trailing separators.
            continue
        if len(piece) > 3 and piece.endswith("s"):
            piece = piece[:-1]
        tokens.add(piece)
    return frozenset(tokens)
| 37 | + |
| 38 | + |
def matches(pred_t, gold_t):
    """Return True when a predicted token set counts as a hit for a gold one.

    Two token sets match when they share at least two tokens, or when every
    token of the smaller set appears in the larger one.  An empty (or falsy)
    set on either side never matches.
    """
    if not (pred_t and gold_t):
        return False
    shared = len(pred_t & gold_t)
    if not shared:
        return False
    if shared >= 2:
        return True
    # Exactly one shared token: only a hit if the smaller set has one token.
    return shared >= min(len(pred_t), len(gold_t))
| 47 | + |
| 48 | + |
def grade_cell(predictions, gold, qtype):
    """Score one (question, model, arm, rep) cell against its gold terms.

    Args:
        predictions: iterable of predicted localnames.  ``None`` is treated as
            "no predictions" and a bare string as a single prediction, so a
            malformed row is graded instead of crashing or being split into
            characters.
        gold: iterable of gold localnames, with the same ``None`` / bare-string
            handling as ``predictions``.
        qtype: question type.  ``"neighbour"`` and ``"subclass"`` cap the
            predictions to the first ``RECALL_K``; ``"existence"`` uses
            any-match recall.

    Returns:
        dict with float values under ``"recall"``, ``"precision"``, ``"f1"``
        and ``"halluc"``.
    """

    def as_list(value):
        # JSON null -> empty; a lone string must not be iterated per character.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    preds = as_list(predictions)
    golds = as_list(gold)
    if qtype in ("neighbour", "subclass"):
        preds = preds[:RECALL_K]

    pred_t = [toks(p) for p in preds]
    gold_t = [toks(g) for g in golds]

    # Greedy 1:1 assignment: each prediction claims the first still-unclaimed
    # gold it matches, and each gold can be claimed only once.
    claimed = set()
    matched = 0
    for pt in pred_t:
        for gi, gt in enumerate(gold_t):
            if gi in claimed:
                continue
            if matches(pt, gt):
                claimed.add(gi)
                matched += 1
                break

    # Denominators are floored at 1 so empty inputs score 0 instead of dividing
    # by zero.
    n_pred = max(1, len(preds))
    n_gold = max(1, len(golds))
    if qtype == "existence":
        recall = 1.0 if matched >= 1 else 0.0
    else:
        recall = matched / n_gold
    precision = matched / n_pred
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
    halluc = (len(preds) - matched) / n_pred
    return {"recall": recall, "precision": precision, "f1": f1, "halluc": halluc}
| 75 | + |
| 76 | + |
def main():
    """Grade a predictions JSONL against a gold file and emit the summary.

    Command line:
        grade.py <gold.json> <predictions.jsonl> [out_summary.json] [out_rows.csv]

    The gold file is either a JSON list of items or an object holding that
    list under ``"questions"`` or ``"items"``.  Each item is addressable by its
    list index (int or string) and by its ``seed`` / ``label`` / ``concept``
    values.  Every non-blank JSONL row whose ``question_id`` resolves to a gold
    item is graded with ``grade_cell``; unresolved rows are skipped and counted
    on stderr.  The summary JSON (``by_model`` / ``by_type`` / ``overall``) is
    written to ``out_summary`` or, when omitted, to standard output.  Per-row
    scores are written as CSV when ``out_rows`` is given.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: grade.py <gold.json> <predictions.jsonl> [out_summary.json] [out_rows.csv]")
    gold_path, pred_path = sys.argv[1], sys.argv[2]
    out_summary = sys.argv[3] if len(sys.argv) > 3 else None
    out_rows = sys.argv[4] if len(sys.argv) > 4 else None

    with open(gold_path, encoding="utf-8") as fh:
        goldj = json.load(fh)
    if isinstance(goldj, list):
        gold_items = goldj
    else:
        gold_items = goldj.get("questions") or goldj.get("items")
    if gold_items is None:
        sys.exit(f"{gold_path}: expected a list, or an object with 'questions' or 'items'")

    gold_by_id = {}
    for i, it in enumerate(gold_items):
        for key in (it.get("seed"), it.get("label"), it.get("concept"), str(i)):
            if key:
                gold_by_id[str(key)] = it
        gold_by_id[i] = it  # numeric index too

    rows = []
    skipped = 0
    with open(pred_path, encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            r = json.loads(line)
            qid = r.get("question_id")
            # Always go through the lookup table: indexing gold_items directly
            # would raise on an out-of-range id and silently wrap a negative
            # one onto the wrong question.
            it = gold_by_id.get(qid)
            if it is None:
                it = gold_by_id.get(str(qid))
            if it is None:
                skipped += 1
                continue
            qtype = it.get("type", "neighbour")
            # `or []` also covers an explicit JSON null.
            g = grade_cell(r.get("predictions") or [], it.get("gold") or [], qtype)
            rows.append({"model": r.get("model", "?"), "arm": r.get("arm", "?"),
                         "type": qtype, "qid": str(qid), **g})

    def agg(filt):
        # Mean of each metric over the selected rows, or None when none match.
        sel = [r for r in rows if filt(r)]
        if not sel:
            return None
        n = len(sel)
        return {k: round(sum(r[k] for r in sel) / n, 4) for k in ("f1", "recall", "precision", "halluc")}

    def compare(filt, with_halluc=True):
        # aug-vs-ctl F1 comparison over the rows accepted by `filt`.  A missing
        # arm reports None for its F1 and counts as 0 in the delta.
        a = agg(lambda r: r["arm"] == "aug" and filt(r)) or {}
        c = agg(lambda r: r["arm"] == "ctl" and filt(r)) or {}
        out = {"aug": a.get("f1"), "ctl": c.get("f1"),
               "delta": round(a.get("f1", 0) - c.get("f1", 0), 4)}
        if with_halluc:
            out["aug_halluc"] = a.get("halluc")
            out["ctl_halluc"] = c.get("halluc")
        return out

    models = sorted({r["model"] for r in rows})
    types = sorted({r["type"] for r in rows})
    summary = {"by_model": {}, "by_type": {}, "overall": {}}
    for m in models:
        summary["by_model"][m] = compare(lambda r, m=m: r["model"] == m)
    for t in types:
        summary["by_type"][t] = compare(lambda r, t=t: r["type"] == t, with_halluc=False)
    summary["overall"] = compare(lambda r: True)
    summary["overall"]["n_rows"] = len(rows)

    if out_summary is None:
        # Portable replacement for opening "/dev/stdout".
        json.dump(summary, sys.stdout, indent=2)
        sys.stdout.flush()
    else:
        with open(out_summary, "w", encoding="utf-8") as fh:
            json.dump(summary, fh, indent=2)
    if out_rows:
        with open(out_rows, "w", newline="", encoding="utf-8") as fh:
            w = csv.DictWriter(fh, fieldnames=["model", "arm", "type", "qid", "f1", "recall", "precision", "halluc"])
            w.writeheader()
            w.writerows(rows)
    if skipped:
        sys.stderr.write(f"skipped {skipped} rows with no matching gold question\n")
    sys.stderr.write(f"graded {len(rows)} cells; {len(models)} models; overall aug={summary['overall']['aug']} ctl={summary['overall']['ctl']} Δ={summary['overall']['delta']}\n")


if __name__ == "__main__":
    main()
0 commit comments