|
| 1 | +/** |
| 2 | + * Groundedness — "did the retrieval PROVIDER surface what the task needed?" |
| 3 | + * |
| 4 | + * A search/research provider returns text; the task needed certain facts or |
| 5 | + * symbols to be solvable (the CURRENT API, a version number, a function name). |
| 6 | + * This module scores how much of that required knowledge the provider's results |
| 7 | + * actually surfaced — isolating PROVIDER quality (was the right thing |
| 8 | + * retrievable / returned) from AGENT skill (did the agent then use it). A high |
| 9 | + * groundedness score with a failed run blames the agent; a low score blames the |
| 10 | + * provider. That separation is the whole point — pass/fail alone cannot make it. |
| 11 | + * |
| 12 | + * Structural sibling of `../authenticity`: |
| 13 | + * - authenticity scores the agent's PRODUCED files for realness. |
| 14 | + * - groundedness scores the provider's RETRIEVED text for coverage. |
| 15 | + * Both are pure deterministic scorers whose DOMAIN config is supplied by the |
| 16 | + * consumer (authenticity: `AuthenticitySignals`; groundedness: |
| 17 | + * `requiredKnowledge: string[]`) — neither bakes in a benchmark's vocabulary. |
| 18 | + * |
| 19 | + * Relationship to `keyword-coverage-judge`: that judge scores the agent's |
| 20 | + * SERVED OUTPUT (HTML + assets) for expected concepts — a different input |
| 21 | + * (produced deliverable) answering a different question (deliverable quality). |
| 22 | + * Groundedness reads the RETRIEVAL side (provider results). They are |
| 23 | + * complementary coverage scorers over different stages of the run, not |
| 24 | + * duplicates; do not collapse one into the other. |
| 25 | + * |
| 26 | + * Two seams, neither forked: |
| 27 | + * - PURE SCORER `scoreGroundedness(resultText, requiredKnowledge)` — case- |
| 28 | + * insensitive substring containment over a deduped key set. Fail-open: with |
| 29 | + * no required knowledge there is nothing to ground, so `score = 1`. |
| 30 | + * - TRACE EXTRACTOR `extractRetrievedText(spans, opts?)` — pulls the provider's |
| 31 | + * returned text out of the canonical `TraceSchema` spans (`RetrievalSpan.hits` |
| 32 | + * + provider `ToolSpan.result`) instead of re-parsing bespoke run files. This |
| 33 | + * is the retrieval-side analog of `extractProducedState` (events → produced |
| 34 | + * files): structural span input, no IO, no disk walking. |
| 35 | + */ |
| 36 | + |
| 37 | +import type { RetrievalSpan, Span, ToolSpan } from '../trace/schema' |
| 38 | +import { isRetrievalSpan, isToolSpan } from '../trace/schema' |
| 39 | + |
| 40 | +// ── Pure scorer ────────────────────────────────────────────────────────────── |
| 41 | + |
/**
 * Outcome of scoring one blob of provider-returned text against a list of
 * required-knowledge keys.
 */
export interface GroundednessResult {
  /** Share (0..1) of the distinct required keys that the result text contains.
   *  Fail-open: `1` when there are no required keys to ground. */
  score: number
  /** Required keys that were found in the result text, deduped and in their
   *  first-seen (trimmed) casing. */
  found: string[]
  /** Required keys that were NOT found in the result text. */
  missing: string[]
  /** Number of distinct required keys after dedup — the denominator of `score`. */
  total: number
  /** Whether the provider returned any non-whitespace text at all. Separates
   *  "provider surfaced nothing" (`!hadResults`) from "returned text but missed
   *  the facts" (`hadResults && score < 1`). */
  hadResults: boolean
}
| 57 | + |
/**
 * Collapse a list of knowledge keys to its distinct entries.
 *
 * Keys are trimmed, blank entries are discarded, and two keys that differ only
 * by letter case count as the same key — the first spelling encountered wins.
 * Output order follows first appearance, so the caller's `total` cannot be
 * inflated by listing one symbol several times.
 */
function dedupeKeys(keys: readonly string[]): string[] {
  // lower-cased key → first-seen trimmed spelling; Map keeps insertion order.
  const firstSpelling = new Map<string, string>()
  for (const candidate of keys) {
    const trimmed = candidate.trim()
    if (trimmed.length === 0) continue
    const folded = trimmed.toLowerCase()
    if (!firstSpelling.has(folded)) firstSpelling.set(folded, trimmed)
  }
  return Array.from(firstSpelling.values())
}
| 76 | + |
/**
 * Measure what fraction of `requiredKnowledge` appears in the provider's
 * `resultText`. Deterministic: no IO, no model calls.
 *
 * A key counts as surfaced when its lower-cased form is a substring of the
 * lower-cased result text. Keys are deduplicated first (case-insensitively,
 * blanks dropped), and that distinct set is the score's denominator.
 *
 * When the distinct key set is empty there is nothing to ground, so the result
 * is fail-open: `score = 1` with empty `found` / `missing`.
 *
 * @param resultText Text returned by the retrieval provider.
 * @param requiredKnowledge Facts/symbols the task needed, supplied by the caller.
 * @returns Score plus the found/missing partition of the distinct keys and a
 *   flag telling whether any non-whitespace text was returned.
 */
export function scoreGroundedness(
  resultText: string,
  requiredKnowledge: readonly string[],
): GroundednessResult {
  const distinctKeys = dedupeKeys(requiredKnowledge)
  // Tolerate a nullish text from untyped callers.
  const body = resultText ?? ''
  const hadResults = body.trim().length > 0

  // Nothing required → nothing the provider could have missed.
  if (distinctKeys.length === 0) {
    return { score: 1, found: [], missing: [], total: 0, hadResults }
  }

  const loweredBody = body.toLowerCase()
  const isSurfaced = (key: string): boolean => loweredBody.includes(key.toLowerCase())

  const found = distinctKeys.filter((key) => isSurfaced(key))
  const missing = distinctKeys.filter((key) => !isSurfaced(key))

  return {
    score: found.length / distinctKeys.length,
    found,
    missing,
    total: distinctKeys.length,
    hadResults,
  }
}
| 115 | + |
| 116 | +// ── Trace extractor ──────────────────────────────────────────────────────── |
| 117 | + |
/**
 * Decides, from a tool's name alone, whether a `ToolSpan` is a call to the
 * retrieval PROVIDER (i.e. its `result` holds returned text). Supplied by the
 * consumer so no benchmark-specific tool vocabulary is hard-coded here.
 */
export type ProviderToolMatcher = (toolName: string) => boolean

/**
 * Default matcher: accepts names containing "search" or "research"
 * (case-insensitive) and rejects any name containing "fetch". Consumers whose
 * tools are named differently should pass their own matcher. This only governs
 * generic `ToolSpan`s; `RetrievalSpan`s are collected regardless.
 */
export const defaultProviderToolMatcher: ProviderToolMatcher = (name) => {
  // A fetch-style tool is never treated as a provider call, whatever else it says.
  if (/fetch/i.test(name)) return false
  return /search|research/i.test(name)
}
| 135 | + |
/** Optional knobs for `extractRetrievedText` / `scoreGroundednessForRun`. */
export interface ExtractRetrievedTextOptions {
  /** Which `ToolSpan`s count as provider calls, decided by tool name.
   *  Default: {@link defaultProviderToolMatcher}. */
  isProviderTool?: ProviderToolMatcher
}
| 140 | + |
/**
 * Turn a `ToolSpan.result` of unknown shape into searchable text.
 *
 * - `null` / `undefined` → `''`.
 * - strings are returned untouched.
 * - anything else is JSON-serialized. `JSON.stringify` yields `undefined` for
 *   functions and symbols; that is coerced to `''` so the declared `string`
 *   return type always holds.
 * - if plain serialization throws (circular reference, BigInt), a tolerant
 *   second pass drops already-visited objects and renders BigInts as decimal
 *   strings, so the text that IS present stays searchable instead of collapsing
 *   to the useless `"[object Object]"`.
 *
 * Never throws; returns `''` when nothing can be extracted.
 */
function resultToText(result: unknown): string {
  if (result == null) return ''
  if (typeof result === 'string') return result

  try {
    // Common case — output is unchanged for every value that serializes cleanly.
    return JSON.stringify(result) ?? ''
  } catch {
    // Fall through to the tolerant path below.
  }

  // A bare primitive that JSON cannot encode (i.e. a top-level BigInt).
  if (typeof result !== 'object') return String(result)

  try {
    const visited = new WeakSet<object>()
    const tolerant = (_key: string, value: unknown): unknown => {
      if (typeof value === 'bigint') return value.toString()
      if (typeof value === 'object' && value !== null) {
        // Skip repeats: breaks cycles; a shared subtree was already emitted once,
        // which is all substring matching needs.
        if (visited.has(value)) return undefined
        visited.add(value)
      }
      return value
    }
    return JSON.stringify(result, tolerant) ?? ''
  } catch {
    // e.g. a throwing `toJSON` or getter — nothing trustworthy to extract.
    return ''
  }
}
| 151 | + |
/**
 * Collect the text a `RetrievalSpan` carries: the `content` of each hit, in
 * order, one per line. Hits with missing or empty content contribute nothing.
 */
function retrievalSpanText(span: RetrievalSpan): string {
  const pieces: string[] = []
  for (const hit of span.hits) {
    const content = hit.content ?? ''
    if (content.length > 0) pieces.push(content)
  }
  return pieces.join('\n')
}
| 159 | + |
/**
 * Gather everything the retrieval PROVIDER returned during a run into a single
 * newline-joined text blob, suitable as input to `scoreGroundedness`.
 *
 * Two span kinds contribute, in stream order:
 *   - every `RetrievalSpan`, via the `content` of its hits;
 *   - every `ToolSpan` whose `toolName` the provider matcher accepts, via its
 *     stringified `result`.
 * All other spans, and spans that yield no text, are ignored. No IO is done.
 *
 * @param spans Span stream for the run.
 * @param opts Optional matcher override; defaults to `defaultProviderToolMatcher`.
 * @returns The concatenated retrieved text (`''` when nothing was retrieved).
 */
export function extractRetrievedText(
  spans: readonly Span[],
  opts: ExtractRetrievedTextOptions = {},
): string {
  const acceptsTool = opts.isProviderTool ?? defaultProviderToolMatcher
  const chunks: string[] = []

  for (const span of spans) {
    let text = ''
    if (isRetrievalSpan(span)) {
      text = retrievalSpanText(span)
    } else if (isToolSpan(span)) {
      const tool = span as ToolSpan
      if (acceptsTool(tool.toolName)) text = resultToText(tool.result)
    }
    // Skip spans that produced no text so the blob has no empty lines from them.
    if (text) chunks.push(text)
  }

  return chunks.join('\n')
}
| 193 | + |
| 194 | +// ── Convenience: extract-then-score ─────────────────────────────────────────── |
| 195 | + |
/**
 * One-call convenience: pull the provider's retrieved text out of a run's
 * spans, then score it against `requiredKnowledge`. Equivalent to calling
 * `extractRetrievedText` followed by `scoreGroundedness`; handy when the caller
 * already holds a run's `Span[]`.
 *
 * @param spans Span stream for the run.
 * @param requiredKnowledge Facts/symbols the task needed.
 * @param opts Forwarded to `extractRetrievedText` (provider-tool matcher).
 */
export function scoreGroundednessForRun(
  spans: readonly Span[],
  requiredKnowledge: readonly string[],
  opts: ExtractRetrievedTextOptions = {},
): GroundednessResult {
  const retrievedText = extractRetrievedText(spans, opts)
  return scoreGroundedness(retrievedText, requiredKnowledge)
}
0 commit comments