Skip to content

Commit 9a3941c

Browse files
committed
docs(eval): multi-model ontology benchmark + LinkedIn deck (Nano Banana upcycled)
5-model eval (Opus/Sonnet/Haiku/Gemini 2.5 Pro/GLM-5.2) on the re-ingested ontology: ontology-augmented vs control, F1 0.37 -> 0.81 (+0.44 mean), hallucination ~halved; universal lift across every model. Subclass retrieval fixed (children query). - ontology-augment-linkedin-deck.pdf (10 slides, Nano Banana Pro upcycled, 4:3) - ontology-augment-deck-beamer.pdf + deck.tex (PGFPlots source) - linkedin-slides/slide-01..10.jpg (upcycled stills) - summary-mm.json, results-mm.csv, gold.json (new-KG gold) Co-Authored-By: jjohare <github@thedreamlab.uk>
1 parent 19292ee commit 9a3941c

16 files changed

Lines changed: 412 additions & 0 deletions

docs/eval/deck.tex

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
% Deck source: 4:3 beamer slides for the multi-model ontology-grounding benchmark.
% fontspec is loaded below, so build with XeLaTeX or LuaLaTeX (not pdfLaTeX).
\documentclass[aspectratio=43,11pt]{beamer}

% --- Packages -----------------------------------------------------------------
\usepackage{fontspec}
\usepackage{pgfplots}
\pgfplotsset{compat=1.18}
\usepackage{tikz}
\usetikzlibrary{positioning}

% --- Theme: default colour theme, navigation symbols removed ------------------
\usecolortheme{default}
\setbeamertemplate{navigation symbols}{}

% --- Palette --------------------------------------------------------------------
\definecolor{teal0}{RGB}{0,128,128}
\definecolor{burnt}{RGB}{204,85,0}
\definecolor{ink}{RGB}{33,37,41}
\definecolor{paper}{RGB}{250,249,246}

% --- Beamer colours and fonts ---------------------------------------------------
\setbeamercolor{background canvas}{bg=paper}
\setbeamercolor{frametitle}{fg=teal0}
\setbeamercolor{title}{fg=teal0}
\setbeamercolor{structure}{fg=burnt}
\setbeamercolor{normal text}{fg=ink}
\setbeamerfont{frametitle}{series=\bfseries}

% Frame title: bold title in the frametitle colour with a burnt-orange rule
% spanning the line width underneath it.
\setbeamertemplate{frametitle}{%
  \vskip6pt
  \usebeamerfont{frametitle}\usebeamercolor[fg]{frametitle}%
  \insertframetitle\par
  \vskip-2pt
  {\color{burnt}\rule{\linewidth}{1.2pt}}%
}

% List markers: burnt bullet at level one, teal en-dash at level two.
\setbeamertemplate{itemize item}{\color{burnt}\textbullet}
\setbeamertemplate{itemize subitem}{\color{teal0}--}

% \foot{<text>}: inserts \vfill and then sets <text> in small teal type.
% The \par sits inside the group so that a footer which wraps onto a second
% line is broken while \scriptsize is still in force; ending the paragraph
% after the group would use the body-size \baselineskip instead.
\newcommand{\foot}[1]{\vfill{\scriptsize\color{teal0}#1\par}}

% Shared plot styling. The A and C styles are the bar fills used by the
% charts: A for the ontology-augmented series, C for the control series.
\pgfplotsset{
  every axis/.append style={
    font=\small,
    axis line style={gray!50},
    tick style={gray!50},
    grid=major,
    major grid style={gray!20},
    label style={color=ink},
    tick label style={color=ink}
  },
  A/.style={fill=teal0,draw=teal0!70},
  C/.style={fill=burnt,draw=burnt!70},
}

% Title metadata.
\title{\textbf{Grounding LLMs in a Formal Ontology}}
\subtitle{A pervasive knowledge-graph binding that makes every model measurably smarter}
\author{\textbf{VisionFlow} \textbullet\ VisionClaw \textbullet\ Agentbox}
\date{\textcolor{burnt}{\url{http://www.visionflow.info}} \quad\textbullet\quad 2026-06-14}
31+
32+
\begin{document}
33+
34+
% Hand-built title slide. The footline template is emptied inside a group so
% the change stays local to this frame.
{\setbeamertemplate{footline}{}
\begin{frame}[plain]
  \vfill
  \centering
  % Deck title, broken over two lines.
  {\color{teal0}\Huge\textbf{Grounding LLMs in a\\[2pt] Formal Ontology}\par}
  \vskip10pt
  % Subtitle.
  {\large A pervasive knowledge-graph binding that makes\\
    \emph{every} model measurably smarter\par}
  \vskip16pt
  % Headline metric.
  {\color{burnt}\Large\textbf{F1 0.37 \;$\to$\; 0.81}}\;{\normalsize across 5 LLMs}\par
  \vskip20pt
  % Product names and link.
  {\large\textbf{VisionFlow} \;\textbullet\; VisionClaw \;\textbullet\; Agentbox\par}
  \vskip6pt
  {\color{burnt}\large\url{http://www.visionflow.info}\par}
  \vfill
\end{frame}}
48+
49+
% Lead slide: the one-sentence claim, the boxed mean-F1 delta, and the
% experiment size.
\begin{frame}{The headline}
  \begin{center}
    \vskip4pt
    % \par is kept inside the \Large group so this two-line sentence is broken
    % with the \Large line spacing rather than the body-size \baselineskip.
    {\Large Connecting a \textbf{formal ontology}
      (4{,}196 OWL classes, 222k inferred axioms)\\
      to \emph{every} AI call lifts factual recall
      \textbf{across the board}.\par}
    \vskip14pt
    % Boxed call-out for the mean improvement.
    \begin{tikzpicture}
      \node[
        draw=teal0,
        line width=1.2pt,
        rounded corners,
        inner sep=10pt
      ] {\color{teal0}\Huge\textbf{+0.44 mean F1}};
    \end{tikzpicture}
    \vskip10pt
    {\large Augmented \textbf{0.81} vs.\ ungrounded \textbf{0.37}
      \quad\textbullet\quad hallucination roughly \textbf{halved}}\\[2pt]
    {\normalsize 5 models \;\textbullet\; 16 KG-grounded questions
      \;\textbullet\; 160 isolated runs \;\textbullet\; objective scoring}
  \end{center}
  \foot{Lead, not buried: the binding works, and it is model-agnostic.}
\end{frame}
63+
64+
% Product overview: the three components and the binding that joins them.
\begin{frame}{What we built}
  \begin{itemize}
    % \url{visionflow.info} has no scheme, so the link target it produces is a
    % relative one. \href carries the absolute address used on the other
    % slides; \nolinkurl keeps the short printed form in the URL font.
    \item \textbf{VisionFlow} --- the immersive 3D knowledge-graph + agent platform
      (\href{http://www.visionflow.info}{\nolinkurl{visionflow.info}}).
    \item \textbf{VisionClaw} --- the Rust engine: Oxigraph/Whelk ontology store,
      GPU physics, real-time graph.
    \item \textbf{Agentbox} --- the sovereign agent runtime; 100+ skills, MCP tooling,
      governed memory.
  \end{itemize}
  \vskip6pt
  {\color{teal0}\textbf{The final piece:}} a \emph{pervasive ontology binding} so any
  AI call can ground itself in the formal knowledge graph --- read-pervasive,
  write-governed, budget-bounded, fail-open.
  \foot{Features are legion; this deck leads with the one that ties them together.}
\end{frame}
75+
76+
% Architecture diagram: knowledge graph -> retrieval component -> PUSH / PULL
% delivery, followed by the read/write policy bullets.
\begin{frame}{The binding, in one picture}
  \centering
  % At natural size the three node columns (3.2cm, 3.4cm and 3.0cm text widths
  % plus padding) and the original 22mm/14mm gaps add up to roughly 14cm, which
  % is wider than the 4:3 slide, so the PUSH and PULL boxes ran off the right
  % edge. The gaps are tightened and the picture is scaled down to \linewidth
  % whenever it is still too wide; it is never enlarged.
  \resizebox{\ifdim\width>\linewidth \linewidth\else\width\fi}{!}{%
    \begin{tikzpicture}[node distance=7mm,every node/.style={font=\small}]
      % Source of truth.
      \node[draw=teal0,line width=1pt,rounded corners,fill=teal0!8,
            inner sep=6pt,text width=3.2cm,align=center]
        (kg) {\textbf{Knowledge Graph}\\Oxigraph + Whelk\\4{,}196 classes\\222k inferred};
      % Shared retrieval entry point.
      \node[draw=burnt,line width=1pt,rounded corners,fill=burnt!8,
            inner sep=6pt,text width=3.4cm,align=center,right=12mm of kg]
        (brain) {\textbf{One retrieval brain}\\\texttt{ontology\_ask}\\budget-bounded \textbullet\ fail-open};
      % Two delivery modes, stacked to the right of the retrieval node.
      \node[draw=ink,rounded corners,inner sep=5pt,text width=3.0cm,align=center,
            above right=6mm and 10mm of brain]
        (push) {\textbf{PUSH}\\per-turn breadcrumb};
      \node[draw=ink,rounded corners,inner sep=5pt,text width=3.0cm,align=center,
            below right=6mm and 10mm of brain]
        (pull) {\textbf{PULL}\\subgraph on demand};
      \draw[->,teal0,line width=1pt] (kg)--(brain);
      \draw[->,burnt,line width=1pt] (brain.east)--(push.west);
      \draw[->,burnt,line width=1pt] (brain.east)--(pull.west);
    \end{tikzpicture}%
  }
  \vskip8pt
  \begin{itemize}\small
    \item \textbf{Read-pervasive:} every agent, consultant and turn can consult the KG.
    \item \textbf{Write-governed:} proposals are auth-gated and queued; derived facts are fenced.
  \end{itemize}
  \foot{One shared library --- the MCP tool, the consultant seam and the CLI share identical grounding.}
\end{frame}
94+
95+
% Evaluation protocol: where the gold answers come from, how the two arms
% differ, the size of the run matrix, and the scoring metrics.
\begin{frame}{How we measured it (objectively)}
  \begin{itemize}
    \item \textbf{KG-as-oracle:} ground truth generated \emph{from the graph itself} ---
      neighbours, subclasses, class existence --- so scoring is deterministic,
      not subjective.
    \item \textbf{Clean A/B:} each cell is an \emph{isolated} session given only the
      question; augmented arm receives the ontology subgraph, control uses
      parametric knowledge only.
    % 5 x 16 x 2 = 160.
    \item \textbf{5 models $\times$ 16 questions $\times$ 2 conditions} =
      \textbf{160 isolated runs}.
    \item \textbf{Grader:} precision / recall / F1 + hallucination, token-set matched.
  \end{itemize}
  \foot{Anthropic Opus/Sonnet/Haiku, Google Gemini 2.5 Pro, Z.AI GLM-5.2.}
\end{frame}
106+
107+
% Grouped bar chart of mean F1 per model: ontology-augmented (style A) against
% control (style C).
\begin{frame}{Result: every model wins}
  \centering
  \begin{tikzpicture}
    \begin{axis}[
        ybar,
        width=11cm,
        height=6.2cm,
        bar width=9pt,
        ymin=0,
        ymax=1,
        ylabel={Mean F1},
        % One x position per model; tick labels are tilted.
        symbolic x coords={Opus 4.8,Sonnet 4.6,Haiku 4.5,Gemini 2.5 Pro,GLM-5.2},
        xtick=data,
        x tick label style={rotate=20,anchor=east,font=\footnotesize},
        enlarge x limits=0.12,
        % Two-column legend placed below the axis.
        legend style={at={(0.5,-0.28)},anchor=north,legend columns=2,draw=gray!40},
        % Print each bar's value above it.
        nodes near coords,
        nodes near coords style={font=\tiny}
      ]
      % Ontology-augmented arm.
      \addplot[A] coordinates {
        (Opus 4.8,0.805)
        (Sonnet 4.6,0.845)
        (Haiku 4.5,0.817)
        (Gemini 2.5 Pro,0.778)
        (GLM-5.2,0.817)
      };
      % Control arm.
      \addplot[C] coordinates {
        (Opus 4.8,0.385)
        (Sonnet 4.6,0.354)
        (Haiku 4.5,0.273)
        (Gemini 2.5 Pro,0.473)
        (GLM-5.2,0.362)
      };
      \legend{Ontology-augmented,Control (parametric only)}
    \end{axis}
  \end{tikzpicture}
  % Per-model deltas from the coordinates above run from 0.305 (Gemini) to
  % 0.544 (Haiku).
  \foot{Universal lift: +0.31 to +0.54 F1. The smallest model (Haiku) gains the most.}
\end{frame}
120+
121+
% Grouped bar chart of hallucination rate per model: ontology-augmented
% (style A) against control (style C).
% NOTE(review): the plotted values average about 0.12 (augmented) against
% about 0.62 (control), roughly a five-fold drop. Confirm that "roughly
% halved" is the intended wording, or that the plotted metric is the one the
% claim refers to.
\begin{frame}{Result: hallucination roughly halved}
  \centering
  \begin{tikzpicture}
    \begin{axis}[
        ybar,
        width=11cm,
        height=6.2cm,
        bar width=9pt,
        ymin=0,
        ymax=1,
        ylabel={Hallucination rate},
        % One x position per model; tick labels are tilted.
        symbolic x coords={Opus 4.8,Sonnet 4.6,Haiku 4.5,Gemini 2.5 Pro,GLM-5.2},
        xtick=data,
        x tick label style={rotate=20,anchor=east,font=\footnotesize},
        enlarge x limits=0.12,
        % Two-column legend placed below the axis.
        legend style={at={(0.5,-0.28)},anchor=north,legend columns=2,draw=gray!40},
        % Print each bar's value above it.
        nodes near coords,
        nodes near coords style={font=\tiny}
      ]
      % Ontology-augmented arm.
      \addplot[A] coordinates {
        (Opus 4.8,0.151)
        (Sonnet 4.6,0.12)
        (Haiku 4.5,0.073)
        (Gemini 2.5 Pro,0.125)
        (GLM-5.2,0.135)
      };
      % Control arm.
      \addplot[C] coordinates {
        (Opus 4.8,0.573)
        (Sonnet 4.6,0.594)
        (Haiku 4.5,0.758)
        (Gemini 2.5 Pro,0.552)
        (GLM-5.2,0.64)
      };
      \legend{Ontology-augmented,Control}
    \end{axis}
  \end{tikzpicture}
  \foot{Grounding replaces plausible-but-wrong guesses with the graph's actual vocabulary.}
\end{frame}
134+
135+
% Mean F1 by question type, averaged over all models: augmented (style A)
% against control (style C).
\begin{frame}{Where grounding helps most}
  \centering
  \begin{tikzpicture}
    \begin{axis}[
        ybar,
        width=10cm,
        height=5.8cm,
        bar width=16pt,
        ymin=0,
        ymax=1,
        ylabel={Mean F1 (all models)},
        % One x position per question type.
        symbolic x coords={neighbour,subclass,existence},
        xtick=data,
        enlarge x limits=0.3,
        % Two-column legend placed below the axis.
        legend style={at={(0.5,-0.22)},anchor=north,legend columns=2,draw=gray!40},
        % Print each bar's value above it.
        nodes near coords,
        nodes near coords style={font=\footnotesize}
      ]
      % Augmented arm.
      \addplot[A] coordinates {
        (neighbour,0.917)
        (subclass,0.55)
        (existence,0.865)
      };
      % Control arm.
      \addplot[C] coordinates {
        (neighbour,0.458)
        (subclass,0.048)
        (existence,0.513)
      };
      \legend{Augmented,Control}
    \end{axis}
  \end{tikzpicture}
  % Subclass delta from the coordinates above: 0.55 - 0.048 = 0.502.
  \foot{Biggest gains on proprietary structure (subclasses: +0.50) and niche concepts the base model can't know.}
\end{frame}
148+
149+
% Cost slide: what grounding adds per call and how failure is handled.
\begin{frame}{What it costs}
  \begin{itemize}
    \item Grounding adds context tokens and one retrieval round-trip ---
      \textbf{optional and per-call}.
    % A bare ~ is a non-breaking space in LaTeX, so "~4198" printed no
    % "approximately" sign at all; $\sim$ prints the intended tilde.
    % Thousands are grouped with {,} to match the 4{,}196 figure on other slides.
    \item Gemini 2.5 Pro: $\sim$4{,}198 prompt tokens/query; GLM-5.2: $\sim$3{,}620 ---
      modest for the accuracy gained.
    \item \textbf{Budget-bounded \& fail-open:} if the graph is unreachable, the turn
      proceeds ungrounded --- never blocked.
  \end{itemize}
  \vskip6pt
  {\color{teal0}\textbf{Net:}} a bounded, switchable cost for a large, universal accuracy gain.
  \foot{The binding augments thinking without overpowering the context window.}
\end{frame}
159+
160+
% Closing slide: the conclusion, three supporting points, and the sign-off.
\begin{frame}{The design works}
  \begin{center}
    % \par is kept inside the \Large group so this two-line sentence is broken
    % with the \Large line spacing rather than the body-size \baselineskip.
    {\Large A formal ontology, bound pervasively to AI,\\
      makes \textbf{every} model more accurate and less hallucinatory.\par}
    \vskip10pt
    \begin{itemize}
      \item \textbf{+0.44 mean F1} across 5 LLMs; hallucination roughly halved.
      \item Read-pervasive, write-governed, budget-bounded, fail-open --- production-shaped.
      \item One shared brain across tool, consultant and CLI surfaces.
    \end{itemize}
    \vskip12pt
    % Sign-off: product names and link.
    {\color{burnt}\Large\textbf{VisionFlow}}\;\textbullet\;{\large VisionClaw \textbullet\ Agentbox}\\[4pt]
    {\color{burnt}\large\url{http://www.visionflow.info}}
  \end{center}
\end{frame}
174+
175+
\end{document}

docs/eval/gold.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@
255255
"model-context-protocol",
256256
"open-ai-agents-sdk",
257257
"open-telemetry",
258+
"openai-research-organisation-agents-sdk",
258259
"openai-research-organisation-research-organisation-agents-sdk",
259260
"orchestration",
260261
"orchestration-protocol",
1.65 MB
Loading
1.83 MB
Loading
1.98 MB
Loading
1.8 MB
Loading
2.02 MB
Loading
1.68 MB
Loading
1.55 MB
Loading
1.74 MB
Loading

0 commit comments

Comments
 (0)