feat(groundedness): retrieval-quality scorer (#282)

drewstone · web-flow · commit 9abe3b4ef90c · 2026-06-24T15:35:03.000+03:00
Pure scoreGroundedness(resultText, requiredKnowledge[]) -> {score,found,
missing,total,hadResults} plus a span-based extractRetrievedText over the
canonical TraceSchema (RetrievalSpan.hits + provider ToolSpan.result),
the structural sibling of src/authenticity. Provider-tool selection is an
injected matcher (default search/research-not-fetch), not a baked literal;
requiredKnowledge is a bare string[] supplied by the consumer. Subpath-only
export (./groundedness), no root re-export — mirrors authenticity.
diff --git a/package.json b/package.json
@@ -134,6 +134,11 @@
       "import": "./dist/authenticity/index.js",
       "default": "./dist/authenticity/index.js"
     },
+    "./groundedness": {
+      "types": "./dist/groundedness/index.d.ts",
+      "import": "./dist/groundedness/index.js",
+      "default": "./dist/groundedness/index.js"
+    },
     "./belief-state": {
       "types": "./dist/belief-state/index.d.ts",
       "import": "./dist/belief-state/index.js",
diff --git a/src/groundedness/index.test.ts b/src/groundedness/index.test.ts
@@ -0,0 +1,152 @@
+import { describe, expect, it } from 'vitest'
+import type { Span } from '../trace/schema'
+import {
+  defaultProviderToolMatcher,
+  extractRetrievedText,
+  scoreGroundedness,
+  scoreGroundednessForRun,
+} from './index'
+
+describe('scoreGroundedness', () => {
+  it('scores the share of required knowledge the provider surfaced, case-insensitively', () => {
+    const text = 'The current API uses createMiddleware from hono/factory and streamSSE.'
+    const r = scoreGroundedness(text, ['createMiddleware', 'streamSSE', 'getRuntimeKey'])
+    expect(r.total).toBe(3)
+    expect([...r.found].sort()).toEqual(['createMiddleware', 'streamSSE']) // sort a copy: sort() mutates
+    expect(r.missing).toEqual(['getRuntimeKey'])
+    expect(r.score).toBeCloseTo(2 / 3)
+    expect(r.hadResults).toBe(true)
+  })
+
+  it('matches case-insensitively but reports keys in their original casing', () => {
+    const r = scoreGroundedness('imports * as Z from ZOD', ['z', 'zod']) // text only has upper-case forms
+    expect(r.found).toEqual(['z', 'zod'])
+    expect(r.score).toBe(1)
+  })
+
+  it('dedupes required keys (case-insensitive) so the denominator cannot be inflated', () => {
+    const r = scoreGroundedness('uses viem', ['viem', 'VIEM', ' viem ']) // casing + padding variants of one key
+    expect(r.total).toBe(1)
+    expect(r.score).toBe(1)
+  })
+
+  it('fails open when there is no required knowledge (nothing to ground)', () => {
+    const r = scoreGroundedness('', []) // no keys: score is 1 even though there is no text
+    expect(r.score).toBe(1)
+    expect(r.total).toBe(0)
+    expect(r.hadResults).toBe(false)
+  })
+
+  it('distinguishes "no results" from "results that missed the facts"', () => {
+    const empty = scoreGroundedness('', ['useReadContract']) // provider returned nothing
+    expect(empty.hadResults).toBe(false)
+    expect(empty.score).toBe(0)
+
+    const missed = scoreGroundedness('here is some unrelated prose', ['useReadContract']) // text, wrong facts
+    expect(missed.hadResults).toBe(true)
+    expect(missed.score).toBe(0)
+  })
+})
+
+describe('extractRetrievedText', () => {
+  const base = { runId: 'r1', name: 'n', startedAt: 0 } as const // fields shared by every fixture span
+
+  it('reads RetrievalSpan hit content', () => {
+    const spans: Span[] = [
+      {
+        ...base,
+        spanId: 's1',
+        kind: 'retrieval',
+        query: 'hono factory',
+        hits: [
+          { docId: 'd1', score: 0.9, content: 'createMiddleware from hono/factory' },
+          { docId: 'd2', score: 0.4, content: 'streamSSE from hono/streaming' },
+          { docId: 'd3', score: 0.1 }, // hit without `content` — must be skipped without throwing
+        ],
+      },
+    ]
+    const text = extractRetrievedText(spans)
+    expect(text).toContain('createMiddleware')
+    expect(text).toContain('streamSSE')
+  })
+
+  it('reads provider ToolSpan results by the default matcher, skipping fetch + non-provider tools', () => {
+    const spans: Span[] = [
+      {
+        ...base,
+        spanId: 's1',
+        kind: 'tool',
+        toolName: 'web_search',
+        args: { q: 'viem v2' },
+        result: { snippets: ['useReadContract is current'] }, // non-string result: stringified before matching
+      },
+      {
+        ...base,
+        spanId: 's2',
+        kind: 'tool',
+        toolName: 'fetch_url', // name contains "fetch" — the default matcher rejects it
+        args: {},
+        result: 'getContract legacy',
+      },
+      {
+        ...base,
+        spanId: 's3',
+        kind: 'tool',
+        toolName: 'write_file', // contains neither "search" nor "research" — not a provider tool
+        args: {},
+        result: 'irrelevant',
+      },
+    ]
+    const text = extractRetrievedText(spans)
+    expect(text).toContain('useReadContract')
+    expect(text).not.toContain('getContract legacy')
+    expect(text).not.toContain('irrelevant')
+  })
+
+  it('honors an injected provider matcher (no benchmark literal baked in)', () => {
+    const spans: Span[] = [
+      {
+        ...base,
+        spanId: 's1',
+        kind: 'tool',
+        toolName: 'youcom',
+        args: {},
+        result: 'surfaced fact',
+      },
+    ]
+    const isProviderTool = (name: string) => name === 'youcom'
+    expect(extractRetrievedText(spans, { isProviderTool })).toContain('surfaced fact')
+    // without the injected matcher the default rejects 'youcom', so nothing is extracted
+    expect(extractRetrievedText(spans)).toBe('')
+  })
+
+  it('default matcher accepts search/research and rejects fetch', () => {
+    expect(defaultProviderToolMatcher('web_search')).toBe(true)
+    expect(defaultProviderToolMatcher('deep_research')).toBe(true)
+    expect(defaultProviderToolMatcher('fetch_url')).toBe(false)
+    expect(defaultProviderToolMatcher('read_file')).toBe(false) // no "search"/"research" in the name
+  })
+})
+
+describe('scoreGroundednessForRun', () => {
+  it('extracts provider text from spans then scores it in one call', () => {
+    const base = { runId: 'r1', name: 'n', startedAt: 0 } as const // fields shared by the fixture span
+    const spans: Span[] = [
+      {
+        ...base,
+        spanId: 's1',
+        kind: 'retrieval',
+        query: 'wagmi v2',
+        hits: [{ docId: 'd1', score: 0.9, content: 'useReadContract and useWriteContract' }],
+      },
+    ]
+    const r = scoreGroundednessForRun(spans, [
+      'useReadContract',
+      'useWriteContract',
+      'useContractRead', // not a substring of the hit text — expected in `missing`
+    ])
+    expect([...r.found].sort()).toEqual(['useReadContract', 'useWriteContract']) // sort a copy: sort() mutates
+    expect(r.missing).toEqual(['useContractRead'])
+    expect(r.score).toBeCloseTo(2 / 3)
+  })
+})
diff --git a/src/groundedness/index.ts b/src/groundedness/index.ts
@@ -0,0 +1,209 @@
+/**
+ * Groundedness — "did the retrieval PROVIDER surface what the task needed?"
+ *
+ * A search/research provider returns text; the task needed certain facts or
+ * symbols to be solvable (the CURRENT API, a version number, a function name).
+ * This module scores how much of that required knowledge the provider's results
+ * actually surfaced — isolating PROVIDER quality (was the right thing
+ * retrievable / returned) from AGENT skill (did the agent then use it). A high
+ * groundedness score with a failed run blames the agent; a low score blames the
+ * provider. That separation is the whole point — pass/fail alone cannot make it.
+ *
+ * Structural sibling of `../authenticity`:
+ *   - authenticity scores the agent's PRODUCED files for realness.
+ *   - groundedness scores the provider's RETRIEVED text for coverage.
+ *   Both are pure deterministic scorers whose DOMAIN config is supplied by the
+ *   consumer (authenticity: `AuthenticitySignals`; groundedness:
+ *   `requiredKnowledge: string[]`) — neither bakes in a benchmark's vocabulary.
+ *
+ * Relationship to `keyword-coverage-judge`: that judge scores the agent's
+ * SERVED OUTPUT (HTML + assets) for expected concepts — a different input
+ * (produced deliverable) answering a different question (deliverable quality).
+ * Groundedness reads the RETRIEVAL side (provider results). They are
+ * complementary coverage scorers over different stages of the run, not
+ * duplicates; do not collapse one into the other.
+ *
+ * Two seams, neither forked:
+ *   - PURE SCORER `scoreGroundedness(resultText, requiredKnowledge)` — case-
+ *     insensitive substring containment over a deduped key set. Fail-open: with
+ *     no required knowledge there is nothing to ground, so `score = 1`.
+ *   - TRACE EXTRACTOR `extractRetrievedText(spans, opts?)` — pulls the provider's
+ *     returned text out of the canonical `TraceSchema` spans (`RetrievalSpan.hits`
+ *     + provider `ToolSpan.result`) instead of re-parsing bespoke run files. This
+ *     is the retrieval-side analog of `extractProducedState` (events → produced
+ *     files): structural span input, no IO, no disk walking.
+ */
+
+import type { RetrievalSpan, Span, ToolSpan } from '../trace/schema'
+import { isRetrievalSpan, isToolSpan } from '../trace/schema'
+
+// ── Pure scorer ──────────────────────────────────────────────────────────────
+
+export interface GroundednessResult {
+  /** 0..1 share of required knowledge the result text surfaced (`found.length / total`).
+   *  1 when there is nothing to ground (no keys left after dedup) — fail-open. */
+  score: number
+  /** Required-knowledge keys found in the result text (deduped, first-seen casing). */
+  found: string[]
+  /** Required-knowledge keys NOT found in the result text (deduped, first-seen casing). */
+  missing: string[]
+  /** Count of distinct required-knowledge keys after dedup — the denominator of `score`. */
+  total: number
+  /** Whether the result text had any non-whitespace content. Separates "provider
+   *  surfaced nothing" (`!hadResults`) from "returned text but missed the facts"
+   *  (`hadResults && score < 1`) — the same provider-vs-agent split as the score. */
+  hadResults: boolean
+}
+
+/**
+ * Reduce a knowledge-key list to its distinct entries. Keys are trimmed and
+ * compared case-insensitively; blank keys are dropped and each surviving key
+ * keeps its first-seen casing, so repeating a symbol cannot inflate `total`.
+ */
+function dedupeKeys(keys: readonly string[]): string[] {
+  // Lower-cased key → first-seen spelling. A Map iterates in insertion order,
+  // so the output preserves the order in which distinct keys first appeared.
+  const firstSpelling = new Map<string, string>()
+  for (const candidate of keys) {
+    const trimmed = candidate.trim()
+    if (trimmed.length === 0) continue
+    const folded = trimmed.toLowerCase()
+    if (firstSpelling.has(folded)) continue
+    firstSpelling.set(folded, trimmed)
+  }
+  return Array.from(firstSpelling.values())
+}
+
+/**
+ * Measure what share of `requiredKnowledge` shows up in the retrieval
+ * provider's `resultText`. Pure and deterministic — no IO, no LLM.
+ *
+ * A key counts as surfaced when the lower-cased result text contains the
+ * lower-cased key as a substring. Keys are trimmed and deduped
+ * case-insensitively first, so `total` is the number of distinct keys and
+ * `found` / `missing` report each key in its first-seen casing. Semantic or
+ * paraphrase coverage is out of scope here; a consumer can layer an LLM judge
+ * on top for that.
+ *
+ * Fail-open: when no keys remain after dedup there is nothing to ground, so
+ * the result is `score = 1` with empty `found` / `missing`. `hadResults` is
+ * reported either way and is true only if the text has non-whitespace content.
+ */
+export function scoreGroundedness(
+  resultText: string,
+  requiredKnowledge: readonly string[],
+): GroundednessResult {
+  const distinctKeys = dedupeKeys(requiredKnowledge)
+  // Tolerate a nullish `resultText` arriving from an untyped caller.
+  const body = resultText ?? ''
+  const hadResults = body.trim().length > 0
+
+  // Nothing to ground: fail open rather than penalize the provider.
+  if (distinctKeys.length === 0) {
+    return { score: 1, found: [], missing: [], total: 0, hadResults }
+  }
+
+  const lowered = body.toLowerCase()
+  const isSurfaced = (key: string): boolean => lowered.includes(key.toLowerCase())
+  const found = distinctKeys.filter(isSurfaced)
+  const missing = distinctKeys.filter((key) => !isSurfaced(key))
+  const total = distinctKeys.length
+
+  return { score: found.length / total, found, missing, total, hadResults }
+}
+
+// ── Trace extractor ────────────────────────────────────────────────────────
+
+/**
+ * Predicate over a `ToolSpan`'s `toolName`: returns `true` when that span is a
+ * retrieval-PROVIDER call whose `result` should be read as returned text. Kept a
+ * parameter — never a baked-in literal — so the substrate stays free of any one
+ * benchmark's tool vocabulary, exactly as `AuthenticitySignals` keeps all domain
+ * regexes consumer-supplied.
+export type ProviderToolMatcher = (toolName: string) => boolean
+
+/**
+ * Default {@link ProviderToolMatcher}: accepts a tool name containing `search`
+ * or `research` (case-insensitive) unless it also contains `fetch`. Meant as a
+ * starting point for the common "search arm" naming; a consumer with different
+ * tool names should inject its own matcher. It only decides which generic
+ * `ToolSpan`s count as provider calls — `RetrievalSpan`s are read regardless.
+ */
+export const defaultProviderToolMatcher: ProviderToolMatcher = (name) =>
+  !/fetch/i.test(name) && /search|research/i.test(name)
+
+export interface ExtractRetrievedTextOptions {
+  /** Picks provider `ToolSpan`s by `toolName`. Default: {@link defaultProviderToolMatcher}. */
+  isProviderTool?: ProviderToolMatcher
+}
+
+/** Stringify a `ToolSpan.result` of unknown shape into searchable text ('' when there is none). */
+function resultToText(result: unknown): string {
+  if (result == null) return ''
+  if (typeof result === 'string') return result
+  try { // bigint leaves → digits (stringify would throw); function/symbol → undefined → ''
+    return JSON.stringify(result, (_k, v) => (typeof v === 'bigint' ? String(v) : v)) ?? ''
+  } catch { // cyclic / throwing toJSON: return no text rather than the fake "[object Object]"
+    return typeof result === 'object' ? '' : String(result)
+  }
+}
+
+/** Join the non-empty `content` of a `RetrievalSpan`'s hits, one hit per line. */
+function retrievalSpanText(span: RetrievalSpan): string {
+  // A hit without `content` maps to '' and is dropped by the filter below.
+  const contents = span.hits.map((hit) => hit.content ?? '')
+  const nonEmpty = contents.filter((text) => text.length > 0)
+  return nonEmpty.join('\n')
+}
+
+/**
+ * Collect the text a retrieval PROVIDER returned during a run by walking its
+ * span stream — the retrieval-side counterpart of `extractProducedState`. Only
+ * the canonical `TraceSchema` carriers are read, never bespoke run files:
+ *   - `RetrievalSpan` (kind 'retrieval'): the non-empty `content` of each entry
+ *     in `hits`. These spans are always included; the matcher is not consulted.
+ *   - `ToolSpan` (kind 'tool'): the `result`, but only when `opts.isProviderTool`
+ *     (default {@link defaultProviderToolMatcher}) accepts the span's `toolName`.
+ *
+ * Spans of any other kind, and spans that yield no text, contribute nothing.
+ * The collected pieces are joined with newlines, in span order, into one blob
+ * suitable for `scoreGroundedness`; the result is '' when nothing was found.
+ */
+export function extractRetrievedText(
+  spans: readonly Span[],
+  opts: ExtractRetrievedTextOptions = {},
+): string {
+  const matchesProvider = opts.isProviderTool ?? defaultProviderToolMatcher
+  const chunks: string[] = []
+  for (const span of spans) {
+    let chunk = ''
+    if (isRetrievalSpan(span)) {
+      // Retrieval spans always count, whatever the matcher says.
+      chunk = retrievalSpanText(span)
+    } else if (isToolSpan(span)) {
+      const toolSpan = span as ToolSpan
+      // Tool spans count only when the matcher accepts their name.
+      if (matchesProvider(toolSpan.toolName)) chunk = resultToText(toolSpan.result)
+    }
+    if (chunk) chunks.push(chunk)
+  }
+  return chunks.join('\n')
+}
+
+// ── Convenience: extract-then-score ───────────────────────────────────────────
+
+/**
+ * One-call convenience: run `extractRetrievedText` over a run's spans, then
+ * score the extracted text against `requiredKnowledge` with
+ * `scoreGroundedness`. `opts` is forwarded to the extractor unchanged. Handy
+ * for a consumer that already holds a persisted run's `Span[]`.
+ */
+export function scoreGroundednessForRun(
+  spans: readonly Span[],
+  requiredKnowledge: readonly string[],
+  opts: ExtractRetrievedTextOptions = {},
+): GroundednessResult {
+  const retrievedText = extractRetrievedText(spans, opts)
+  return scoreGroundedness(retrievedText, requiredKnowledge)
+}
diff --git a/tsup.config.ts b/tsup.config.ts
@@ -26,6 +26,7 @@ export default defineConfig({
     'campaign/index': 'src/campaign/index.ts',
     'storyboard/index': 'src/storyboard/index.ts',
     'authenticity/index': 'src/authenticity/index.ts',
+    'groundedness/index': 'src/groundedness/index.ts',
     'belief-state/index': 'src/belief-state/index.ts',
     'workflow/index': 'src/workflow/index.ts',
     'contract/index': 'src/contract/index.ts',