Skip to content

Commit 9abe3b4

Browse files
authored
feat(groundedness): retrieval-quality scorer (#282)
Pure scoreGroundedness(resultText, requiredKnowledge[]) -> {score, found, missing, total, hadResults}, plus a span-based extractRetrievedText over the canonical TraceSchema (RetrievalSpan.hits + provider ToolSpan.result), the structural sibling of src/authenticity. Provider-tool selection is an injected matcher (default: search/research, not fetch), not a baked-in literal; requiredKnowledge is a bare string[] supplied by the consumer. Subpath-only export (./groundedness), no root re-export — mirrors authenticity.
1 parent 49d87ff commit 9abe3b4

4 files changed

Lines changed: 367 additions & 0 deletions

File tree

package.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,11 @@
134134
"import": "./dist/authenticity/index.js",
135135
"default": "./dist/authenticity/index.js"
136136
},
137+
"./groundedness": {
138+
"types": "./dist/groundedness/index.d.ts",
139+
"import": "./dist/groundedness/index.js",
140+
"default": "./dist/groundedness/index.js"
141+
},
137142
"./belief-state": {
138143
"types": "./dist/belief-state/index.d.ts",
139144
"import": "./dist/belief-state/index.js",

src/groundedness/index.test.ts

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import { describe, expect, it } from 'vitest'
2+
import type { Span } from '../trace/schema'
3+
import {
4+
defaultProviderToolMatcher,
5+
extractRetrievedText,
6+
scoreGroundedness,
7+
scoreGroundednessForRun,
8+
} from './index'
9+
10+
// Unit tests for the pure scorer: coverage share, case handling, key dedup,
// the fail-open path, and the `hadResults` distinction.
describe('scoreGroundedness', () => {
  it('scores the share of required knowledge the provider surfaced, case-insensitively', () => {
    const text = 'The current API uses createMiddleware from hono/factory and streamSSE.'
    const r = scoreGroundedness(text, ['createMiddleware', 'streamSSE', 'getRuntimeKey'])
    expect(r.total).toBe(3)
    // Sort a copy: `Array.prototype.sort` sorts in place and would silently
    // reorder the result object under test.
    expect([...r.found].sort()).toEqual(['createMiddleware', 'streamSSE'])
    expect(r.missing).toEqual(['getRuntimeKey'])
    expect(r.score).toBeCloseTo(2 / 3)
    expect(r.hadResults).toBe(true)
  })

  it('matches case-insensitively but reports keys in their original casing', () => {
    const r = scoreGroundedness('imports * as Z from ZOD', ['z', 'zod'])
    expect(r.found).toEqual(['z', 'zod'])
    expect(r.score).toBe(1)
  })

  it('dedupes required keys (case-insensitive) so the denominator cannot be inflated', () => {
    const r = scoreGroundedness('uses viem', ['viem', 'VIEM', ' viem '])
    expect(r.total).toBe(1)
    expect(r.score).toBe(1)
    // The surviving key is the first-seen spelling, trimmed.
    expect(r.found).toEqual(['viem'])
  })

  it('fails open when there is no required knowledge (nothing to ground)', () => {
    const r = scoreGroundedness('', [])
    expect(r.score).toBe(1)
    expect(r.total).toBe(0)
    expect(r.hadResults).toBe(false)
  })

  it('distinguishes "no results" from "results that missed the facts"', () => {
    const empty = scoreGroundedness('', ['useReadContract'])
    expect(empty.hadResults).toBe(false)
    expect(empty.score).toBe(0)

    const missed = scoreGroundedness('here is some unrelated prose', ['useReadContract'])
    expect(missed.hadResults).toBe(true)
    expect(missed.score).toBe(0)
  })
})
50+
51+
// Unit tests for the span extractor: RetrievalSpan hits, provider ToolSpan
// results selected by the matcher, and the default matcher itself.
describe('extractRetrievedText', () => {
  const base = { runId: 'r1', name: 'n', startedAt: 0 } as const

  it('reads RetrievalSpan hit content', () => {
    const spans: Span[] = [
      {
        ...base,
        spanId: 's1',
        kind: 'retrieval',
        query: 'hono factory',
        hits: [
          { docId: 'd1', score: 0.9, content: 'createMiddleware from hono/factory' },
          { docId: 'd2', score: 0.4, content: 'streamSSE from hono/streaming' },
          { docId: 'd3', score: 0.1 }, // no content — skipped, not crashed
        ],
      },
    ]
    const text = extractRetrievedText(spans)
    expect(text).toContain('createMiddleware')
    expect(text).toContain('streamSSE')
    // The content-less hit must be dropped, not stringified into the blob.
    expect(text).not.toContain('undefined')
  })

  it('reads provider ToolSpan results by the default matcher, skipping fetch + non-provider tools', () => {
    const spans: Span[] = [
      {
        ...base,
        spanId: 's1',
        kind: 'tool',
        toolName: 'web_search',
        args: { q: 'viem v2' },
        result: { snippets: ['useReadContract is current'] },
      },
      {
        ...base,
        spanId: 's2',
        kind: 'tool',
        toolName: 'fetch_url', // search/research-not-fetch default excludes this
        args: {},
        result: 'getContract legacy',
      },
      {
        ...base,
        spanId: 's3',
        kind: 'tool',
        toolName: 'write_file', // not a provider tool at all
        args: {},
        result: 'irrelevant',
      },
    ]
    const text = extractRetrievedText(spans)
    expect(text).toContain('useReadContract')
    expect(text).not.toContain('getContract legacy')
    expect(text).not.toContain('irrelevant')
  })

  it('honors an injected provider matcher (no benchmark literal baked in)', () => {
    const spans: Span[] = [
      {
        ...base,
        spanId: 's1',
        kind: 'tool',
        toolName: 'youcom',
        args: {},
        result: 'surfaced fact',
      },
    ]
    const isProviderTool = (name: string) => name === 'youcom'
    expect(extractRetrievedText(spans, { isProviderTool })).toContain('surfaced fact')
    // default matcher would NOT pick up 'youcom'
    expect(extractRetrievedText(spans)).toBe('')
  })

  it('default matcher accepts search/research and rejects fetch', () => {
    expect(defaultProviderToolMatcher('web_search')).toBe(true)
    expect(defaultProviderToolMatcher('deep_research')).toBe(true)
    expect(defaultProviderToolMatcher('fetch_url')).toBe(false)
    expect(defaultProviderToolMatcher('read_file')).toBe(false)
  })
})
130+
131+
// End-to-end check of the convenience path: spans in, groundedness result out.
describe('scoreGroundednessForRun', () => {
  it('extracts provider text from spans then scores it in one call', () => {
    const base = { runId: 'r1', name: 'n', startedAt: 0 } as const
    const spans: Span[] = [
      {
        ...base,
        spanId: 's1',
        kind: 'retrieval',
        query: 'wagmi v2',
        hits: [{ docId: 'd1', score: 0.9, content: 'useReadContract and useWriteContract' }],
      },
    ]
    const r = scoreGroundednessForRun(spans, [
      'useReadContract',
      'useWriteContract',
      'useContractRead',
    ])
    // Sort a copy: `Array.prototype.sort` sorts in place and would silently
    // reorder the result object under test.
    expect([...r.found].sort()).toEqual(['useReadContract', 'useWriteContract'])
    expect(r.missing).toEqual(['useContractRead'])
    expect(r.score).toBeCloseTo(2 / 3)
  })
})

src/groundedness/index.ts

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
/**
2+
* Groundedness — "did the retrieval PROVIDER surface what the task needed?"
3+
*
4+
* A search/research provider returns text; the task needed certain facts or
5+
* symbols to be solvable (the CURRENT API, a version number, a function name).
6+
* This module scores how much of that required knowledge the provider's results
7+
* actually surfaced — isolating PROVIDER quality (was the right thing
8+
* retrievable / returned) from AGENT skill (did the agent then use it). A high
9+
* groundedness score with a failed run blames the agent; a low score blames the
10+
* provider. That separation is the whole point — pass/fail alone cannot make it.
11+
*
12+
* Structural sibling of `../authenticity`:
13+
* - authenticity scores the agent's PRODUCED files for realness.
14+
* - groundedness scores the provider's RETRIEVED text for coverage.
15+
* Both are pure deterministic scorers whose DOMAIN config is supplied by the
16+
* consumer (authenticity: `AuthenticitySignals`; groundedness:
17+
* `requiredKnowledge: string[]`) — neither bakes in a benchmark's vocabulary.
18+
*
19+
* Relationship to `keyword-coverage-judge`: that judge scores the agent's
20+
* SERVED OUTPUT (HTML + assets) for expected concepts — a different input
21+
* (produced deliverable) answering a different question (deliverable quality).
22+
* Groundedness reads the RETRIEVAL side (provider results). They are
23+
* complementary coverage scorers over different stages of the run, not
24+
* duplicates; do not collapse one into the other.
25+
*
26+
* Two seams, neither forked:
27+
* - PURE SCORER `scoreGroundedness(resultText, requiredKnowledge)` — case-
28+
* insensitive substring containment over a deduped key set. Fail-open: with
29+
* no required knowledge there is nothing to ground, so `score = 1`.
30+
* - TRACE EXTRACTOR `extractRetrievedText(spans, opts?)` — pulls the provider's
31+
* returned text out of the canonical `TraceSchema` spans (`RetrievalSpan.hits`
32+
* + provider `ToolSpan.result`) instead of re-parsing bespoke run files. This
33+
* is the retrieval-side analog of `extractProducedState` (events → produced
34+
* files): structural span input, no IO, no disk walking.
35+
*/
36+
37+
import type { RetrievalSpan, Span, ToolSpan } from '../trace/schema'
38+
import { isRetrievalSpan, isToolSpan } from '../trace/schema'
39+
40+
// ── Pure scorer ──────────────────────────────────────────────────────────────
41+
42+
export interface GroundednessResult {
  /**
   * Fraction (0..1) of the distinct required-knowledge keys that appear in the
   * provider's result text. Fail-open: `1` when `requiredKnowledge` is empty,
   * because there is nothing to ground.
   */
  score: number
  /** Required-knowledge keys present in the result text (deduped, original casing). */
  found: string[]
  /** Required-knowledge keys absent from the result text. */
  missing: string[]
  /** Count of distinct required-knowledge keys after dedup; the denominator of `score`. */
  total: number
  /**
   * Whether the provider returned any result text at all. Separates "provider
   * surfaced nothing" (`!hadResults`) from "provider returned text that missed
   * the facts" (`hadResults && score < 1`).
   */
  hadResults: boolean
}
57+
58+
/**
 * Collapse a knowledge-key list to its distinct entries, comparing
 * case-insensitively after trimming. The first-seen spelling of each key is
 * kept and blank entries are dropped, so listing the same symbol twice (or in
 * different casing) cannot inflate the score denominator.
 *
 * Keys typically arrive from consumer config, which may be untyped at runtime
 * (e.g. parsed JSON). Non-string entries and a nullish list are skipped rather
 * than throwing.
 *
 * @param keys Raw required-knowledge keys as supplied by the consumer.
 * @returns Distinct trimmed keys in first-seen order.
 */
function dedupeKeys(keys: readonly string[]): string[] {
  // Map iteration follows insertion order, which preserves first-seen ordering.
  const firstSeen = new Map<string, string>()
  for (const raw of keys ?? []) {
    if (typeof raw !== 'string') continue
    const key = raw.trim()
    if (key === '') continue
    const folded = key.toLowerCase()
    if (!firstSeen.has(folded)) firstSeen.set(folded, key)
  }
  return [...firstSeen.values()]
}
76+
77+
/**
 * Measure how much of `requiredKnowledge` appears in the retrieval provider's
 * `resultText`. Deterministic and side-effect free: no IO, no LLM.
 *
 * A key counts as surfaced when the lower-cased result text contains the
 * lower-cased key as a substring. Keys are deduplicated first, so `total` is
 * the number of distinct keys. Paraphrase or semantic coverage is out of scope
 * here and can be layered on by the consumer.
 *
 * Fail-open: when no keys remain after dedup there is nothing to ground, so
 * the score is `1` and both key lists are empty.
 *
 * @param resultText Text returned by the provider; nullish is treated as empty.
 * @param requiredKnowledge Keys the task needed the provider to surface.
 * @returns Score, found/missing keys, distinct key count, and whether any
 *   non-blank text was returned.
 */
export function scoreGroundedness(
  resultText: string,
  requiredKnowledge: readonly string[],
): GroundednessResult {
  const keys = dedupeKeys(requiredKnowledge)
  const text = resultText ?? ''
  const hadResults = text.trim().length > 0

  if (keys.length === 0) {
    return { score: 1, found: [], missing: [], total: 0, hadResults }
  }

  const haystack = text.toLowerCase()
  const found: string[] = []
  const missing: string[] = []
  keys.forEach((key) => {
    const bucket = haystack.includes(key.toLowerCase()) ? found : missing
    bucket.push(key)
  })

  const total = keys.length
  return { score: found.length / total, found, missing, total, hadResults }
}
115+
116+
// ── Trace extractor ────────────────────────────────────────────────────────
117+
118+
/**
 * Decides, from a tool's name alone, whether a `ToolSpan` is a call to a
 * retrieval provider (i.e. its `result` holds returned text worth scoring).
 * Supplied as a parameter so no particular benchmark's tool names are
 * hard-coded into this module.
 */
export type ProviderToolMatcher = (toolName: string) => boolean
125+
126+
/**
 * Fallback matcher used when the consumer does not supply one. Accepts tool
 * names containing "search" or "research" (case-insensitive) and rejects any
 * name containing "fetch". Consumers whose tools are named differently should
 * pass their own matcher.
 *
 * This only governs generic `ToolSpan`s; `RetrievalSpan`s are read by kind and
 * never consult the matcher.
 */
export const defaultProviderToolMatcher: ProviderToolMatcher = (name) => {
  if (/fetch/i.test(name)) return false
  return /search|research/i.test(name)
}
135+
136+
export interface ExtractRetrievedTextOptions {
  /**
   * Predicate choosing which `ToolSpan`s are treated as provider calls.
   * Falls back to {@link defaultProviderToolMatcher} when omitted.
   */
  isProviderTool?: ProviderToolMatcher
}
140+
141+
/**
 * Flatten a `ToolSpan.result` of unknown shape into searchable text.
 *
 * Strings are returned as-is. Objects and arrays are walked, and every string
 * leaf, number, boolean, bigint and property name is collected, joined with
 * newlines. Unlike `JSON.stringify`, this:
 *  - keeps string content unescaped, so keys containing quotes, backslashes or
 *    newlines still match by substring;
 *  - always returns a string (`JSON.stringify` yields `undefined` for
 *    functions and symbols);
 *  - survives circular references and BigInt values instead of collapsing the
 *    whole result to `[object Object]`.
 *
 * Values exposing `toJSON` (e.g. `Date`) are serialized through it.
 *
 * @param result The raw tool result.
 * @returns The collected text, or `''` when there is nothing textual.
 */
function resultToText(result: unknown): string {
  const parts: string[] = []
  // Guards against cycles and repeated visits of shared sub-objects.
  const seen = new WeakSet<object>()

  const visit = (value: unknown): void => {
    if (typeof value === 'string') {
      if (value.length > 0) parts.push(value)
      return
    }
    if (typeof value === 'number' || typeof value === 'boolean' || typeof value === 'bigint') {
      parts.push(String(value))
      return
    }
    // null, undefined, functions and symbols carry no searchable text.
    if (typeof value !== 'object' || value === null) return
    if (seen.has(value)) return
    seen.add(value)

    const toJSON = (value as { toJSON?: unknown }).toJSON
    if (typeof toJSON === 'function') {
      visit(toJSON.call(value))
      return
    }
    if (Array.isArray(value)) {
      for (const item of value) visit(item)
      return
    }
    for (const [key, child] of Object.entries(value)) {
      // Property names were part of the JSON text before; keep them searchable.
      parts.push(key)
      visit(child)
    }
  }

  try {
    visit(result)
  } catch {
    // Best effort: a throwing getter/toJSON or pathological nesting must not
    // fail extraction; keep whatever text was collected so far.
  }
  return parts.join('\n')
}
151+
152+
/**
 * Collect the text carried by a `RetrievalSpan`: the `content` of each hit,
 * in order, one per line. Hits with missing or empty `content` are skipped.
 */
function retrievalSpanText(span: RetrievalSpan): string {
  const chunks: string[] = []
  span.hits.forEach((hit) => {
    const body = hit.content ?? ''
    if (body.length > 0) chunks.push(body)
  })
  return chunks.join('\n')
}
159+
160+
/**
 * Gather the text a retrieval provider returned during a run, reading it from
 * the span stream rather than from bespoke run files. Two carriers are read:
 *  - spans accepted by `isRetrievalSpan`: the `content` of every hit;
 *  - spans accepted by `isToolSpan` whose `toolName` passes the provider
 *    matcher: their `result`, flattened to text.
 *
 * Spans of any other kind, tool spans the matcher rejects, and spans that
 * yield no text are skipped. The pieces are joined with newlines in span
 * order, producing one blob suitable for `scoreGroundedness`.
 *
 * @param spans The run's spans.
 * @param opts Optional matcher override; defaults to `defaultProviderToolMatcher`.
 * @returns The concatenated retrieved text, or `''` if none was found.
 */
export function extractRetrievedText(
  spans: readonly Span[],
  opts: ExtractRetrievedTextOptions = {},
): string {
  const matchesProvider = opts.isProviderTool ?? defaultProviderToolMatcher
  const chunks: string[] = []

  for (const span of spans) {
    let chunk = ''
    if (isRetrievalSpan(span)) {
      chunk = retrievalSpanText(span)
    } else if (isToolSpan(span)) {
      const tool = span as ToolSpan
      if (matchesProvider(tool.toolName)) chunk = resultToText(tool.result)
    }
    // Empty extractions contribute nothing, so they are not joined in.
    if (chunk) chunks.push(chunk)
  }

  return chunks.join('\n')
}
193+
194+
// ── Convenience: extract-then-score ───────────────────────────────────────────
195+
196+
/**
 * One-call convenience for a consumer that already holds a run's `Span[]`:
 * extracts the provider's retrieved text with `extractRetrievedText`, then
 * scores it against `requiredKnowledge` with `scoreGroundedness`.
 *
 * @param spans The run's spans.
 * @param requiredKnowledge Keys the task needed the provider to surface.
 * @param opts Extraction options, forwarded unchanged to `extractRetrievedText`.
 * @returns The groundedness result for the extracted text.
 */
export function scoreGroundednessForRun(
  spans: readonly Span[],
  requiredKnowledge: readonly string[],
  opts: ExtractRetrievedTextOptions = {},
): GroundednessResult {
  const retrieved = extractRetrievedText(spans, opts)
  return scoreGroundedness(retrieved, requiredKnowledge)
}

tsup.config.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ export default defineConfig({
2626
'campaign/index': 'src/campaign/index.ts',
2727
'storyboard/index': 'src/storyboard/index.ts',
2828
'authenticity/index': 'src/authenticity/index.ts',
29+
'groundedness/index': 'src/groundedness/index.ts',
2930
'belief-state/index': 'src/belief-state/index.ts',
3031
'workflow/index': 'src/workflow/index.ts',
3132
'contract/index': 'src/contract/index.ts',

0 commit comments

Comments
 (0)