-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.ts
More file actions
79 lines (75 loc) · 4.06 KB
/
Copy pathindex.ts
File metadata and controls
79 lines (75 loc) · 4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/**
* Eval — the app-shell BRIDGE to `@tangle-network/agent-eval`, not a reimpl.
*
* The completion/scoring ENGINE lives in agent-eval (a peer dependency):
* `verifyCompletion`, `extractProducedState`, `weightedComposite`,
* `createLlmCorrectnessChecker`, and the `CompletionRequirement` / `TaskGold` /
* `ProducedState` types — all re-exported here so a consumer has one import
* root. This module adds only what agent-eval doesn't have and what is
* app-shell-specific:
*
* 1. {@link producedFromToolEvents} — the bridge: turn the structured app-tool
* side channel's `AppToolProducedEvent`s (from a tool runtime executor's
* `onProduced`) into the `RuntimeEventLike`s agent-eval's
* `extractProducedState` consumes. This is the one piece that knows about
* the app-tool channel, so it belongs here, not in the engine.
* 2. {@link createTokenRecallChecker} — a deterministic, no-LLM
* `CorrectnessChecker` (agent-eval ships only the LLM one). For apps/tests
* that gate completion without a judge call.
*
* Full campaigns (persona simulation, traces, scorecards, held-out gates) are
* agent-eval's `runEvalCampaign` / `AgentDriver` / `BenchmarkRunner` — use them
* directly; this module composes with them.
*/
import type { RuntimeEventLike, CompletionRequirement } from '@tangle-network/agent-eval'
import type { AppToolProducedEvent } from '../tools/types'
// Re-export the engine so consumers import completion + scoring from one place.
export { verifyCompletion, extractProducedState, weightedComposite, createLlmCorrectnessChecker } from '@tangle-network/agent-eval'
export type {
CompletionRequirement,
TaskGold,
ProducedState,
SatisfiedBy,
CompletionVerdict,
CorrectnessChecker,
RuntimeEventLike,
} from '@tangle-network/agent-eval'
/**
 * Convert the app-tool side channel's produced events into the runtime-event
 * objects that agent-eval's `extractProducedState` reads. Typical pipeline:
 *   `verifyCompletion(taskGold, extractProducedState(producedFromToolEvents(events)), checker)`
 *
 * Mapping, one output per input and in the same order:
 * - `proposal_created` events keep their `proposalId`, `title`, `status` and
 *   `content` under the same `type`.
 * - every other event is emitted as an `artifact` keyed by its `path`
 *   (`artifactId` = `vault:<path>`, `uri` = `vault://<path>`), always tagged
 *   with the `text/markdown` mime type.
 *
 * @param events Produced events collected from the tool runtime.
 * @returns A new array of runtime events; the input is not modified.
 */
export function producedFromToolEvents(events: readonly AppToolProducedEvent[]): RuntimeEventLike[] {
  const toRuntimeEvent = (event: AppToolProducedEvent): RuntimeEventLike => {
    if (event.type === 'proposal_created') {
      const { proposalId, title, status, content } = event
      return { type: 'proposal_created', proposalId, title, status, content }
    }

    // Anything that is not a proposal is surfaced as a vault-backed artifact.
    const { path, content } = event
    return {
      type: 'artifact',
      artifactId: `vault:${path}`,
      name: path,
      uri: `vault://${path}`,
      mimeType: 'text/markdown',
      content,
    }
  }

  return events.map(toRuntimeEvent)
}
/**
 * Title words ignored when scoring recall: common function words plus generic
 * task words (`review`, `update`, `new`, `proposed`).
 */
const STOPWORDS = new Set(['the', 'a', 'an', 'and', 'or', 'for', 'to', 'of', 'in', 'on', 'with', 'review', 'update', 'new', 'proposed'])

/**
 * A deterministic `CorrectnessChecker` (agent-eval exports only
 * `createLlmCorrectnessChecker`). No network — the default gate for apps/tests
 * without an LLM judge. Pass the returned function to `verifyCompletion` as
 * the checker.
 *
 * A produced item fulfils a requirement when both hold:
 * 1. its trimmed content is at least `minContentLength` characters long, and
 * 2. it contains ≥ `minRecall` of the requirement title's *distinct*
 *    significant tokens.
 *
 * Significant tokens are obtained by lowercasing the title, splitting on any
 * run of characters other than `a-z` / `0-9` (so non-ASCII characters act as
 * separators), then dropping tokens of two characters or fewer and stopwords.
 * A token counts as recalled when it occurs anywhere in the lowercased content
 * as a substring. If the title yields no significant tokens, sufficiently long
 * content is accepted.
 *
 * @param opts.minRecall Minimum fraction of distinct title tokens that must
 *   appear in the content. Defaults to `0.5`.
 * @param opts.minContentLength Minimum trimmed content length, in characters.
 *   Defaults to `120`.
 * @returns An async checker resolving to `{ correct, reason }`, where `reason`
 *   is a human-readable explanation of the verdict.
 */
export function createTokenRecallChecker(opts: { minRecall?: number; minContentLength?: number } = {}): (
  requirement: CompletionRequirement,
  content: string,
) => Promise<{ correct: boolean; reason: string }> {
  const minRecall = opts.minRecall ?? 0.5
  const minLen = opts.minContentLength ?? 120
  return async (requirement, content) => {
    const body = content.trim()
    if (body.length < minLen) return { correct: false, reason: `content too thin (${body.length} chars) to be the deliverable` }

    // De-duplicate so a word repeated in the title is scored once; otherwise
    // repeats inflate (or deflate) recall relative to the distinct vocabulary.
    const tokens = Array.from(
      new Set(
        requirement.title
          .toLowerCase()
          .split(/[^a-z0-9]+/)
          .filter((t) => t.length > 2 && !STOPWORDS.has(t)),
      ),
    )
    if (tokens.length === 0) return { correct: true, reason: 'requirement title has no significant tokens — structural match accepted' }

    const lower = body.toLowerCase()
    const hits = tokens.filter((t) => lower.includes(t)).length
    const recall = hits / tokens.length
    return recall >= minRecall
      ? { correct: true, reason: `content recalls ${hits}/${tokens.length} requirement tokens` }
      : { correct: false, reason: `content recalls only ${hits}/${tokens.length} requirement tokens` }
  }
}