Skip to content

Commit 7741f9d

Browse files
authored
fix(eval): reject authored direct input (#1646)
* fix(eval): reject authored direct input
* Migrate eval input authoring to prompts vars
* fix(eval): reject mixed criteria assertions (#1652)
1 parent 2f95d31 commit 7741f9d

186 files changed

Lines changed: 8511 additions & 7845 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,13 @@ evaluate_options:
8888
8989
default_test: file://./default-test.yaml
9090
91+
prompts:
92+
- "{{ input }}"
93+
9194
tests:
9295
- id: fizzbuzz
93-
input: Write FizzBuzz in Python. Use lowercase output strings "fizz", "buzz", and "fizzbuzz". Return only one Python code block.
96+
vars:
97+
input: Write FizzBuzz in Python. Use lowercase output strings "fizz", "buzz", and "fizzbuzz". Return only one Python code block.
9498
assert:
9599
- type: contains
96100
value: "fizz"
@@ -135,9 +139,13 @@ evaluate_options:
135139
default_test:
136140
threshold: 0.85
137141
142+
prompts:
143+
- "{{ input }}"
144+
138145
tests:
139146
- id: fizzbuzz
140-
input: Write FizzBuzz in Python
147+
vars:
148+
input: Write FizzBuzz in Python
141149
```
142150

143151
`target: local-openai` resolves the configured target id from `.agentv/config.yaml` and uses its provider, model, hooks, and provider settings. The object form above defines a full eval-local target and must include enough provider configuration to run. AgentV records the resolved target information in run artifacts so results can be audited and replayed. The `tags.experiment` label stays `with-skills` because the condition is unchanged; the model/provider variation belongs to the resolved target metadata.
@@ -214,10 +222,11 @@ const { results, summary } = await evaluate({
214222
experiment: 'with-skills',
215223
task: async (input) => runMyAppTarget(input),
216224
threshold: 0.8,
225+
prompts: ['{{ input }}'],
217226
tests: [
218227
{
219228
id: 'fizzbuzz',
220-
input: 'Write FizzBuzz in Python',
229+
vars: { input: 'Write FizzBuzz in Python' },
221230
assert: [
222231
{ type: 'contains', value: 'fizz' },
223232
'Implements correct FizzBuzz logic for multiples of 3, 5, and 15',
@@ -249,6 +258,7 @@ export default defineEval({
249258
earlyExit: false,
250259
},
251260
threshold: 0.8,
261+
prompts: ['{{ input }}'],
252262
workspace: {
253263
scope: 'attempt',
254264
repos: [
@@ -262,7 +272,7 @@ export default defineEval({
262272
tests: [
263273
{
264274
id: 'fizzbuzz',
265-
input: 'Write FizzBuzz in Python',
275+
vars: { input: 'Write FizzBuzz in Python' },
266276
assert: [
267277
{ type: 'contains', value: 'fizz' },
268278
'Implements correct FizzBuzz logic for multiples of 3, 5, and 15',

apps/cli/src/commands/eval/run-eval.ts

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -228,19 +228,22 @@ function buildPlannedResumeIdentityKeys(
228228
test.source?.evalFileAbsolutePath,
229229
]);
230230
const suites = Array.from(new Set<string | null>([test.suite ?? null, null]));
231+
const promptIds = Array.from(new Set<string | null>([test.prompt?.id ?? null, null]));
231232

232233
for (const evalPath of evalPaths) {
233234
for (const suite of suites) {
234-
keys.add(
235-
JSON.stringify({
236-
eval_path: evalPath,
237-
suite,
238-
test_id: test.id ?? 'unknown',
239-
prompt_id: test.prompt?.id ?? null,
240-
target: target ?? 'unknown',
241-
variant: variant ?? null,
242-
}),
243-
);
235+
for (const promptId of promptIds) {
236+
keys.add(
237+
JSON.stringify({
238+
eval_path: evalPath,
239+
suite,
240+
test_id: test.id ?? 'unknown',
241+
prompt_id: promptId,
242+
target: target ?? 'unknown',
243+
variant: variant ?? null,
244+
}),
245+
);
246+
}
244247
}
245248
}
246249

apps/cli/src/commands/eval/task-bundle.ts

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ const TASK_EVAL_FILENAME = 'EVAL.yaml';
1919
const TASK_TARGETS_FILENAME = 'targets.yaml';
2020
const TASK_FILES_DIRNAME = 'files';
2121
const TASK_GRADERS_DIRNAME = 'graders';
22+
const INPUT_PROMPT = '{{ input }}';
2223
const BUNDLE_EVALS_DIRNAME = 'evals';
2324
const BUNDLE_MANIFEST_FILENAME = 'agentv_bundle.json';
2425
const BUNDLE_TARGETS_FILENAME = 'targets.yaml';
@@ -499,6 +500,20 @@ function withoutLegacyAssertionKeys(testCase: Record<string, unknown>): Record<s
499500
);
500501
}
501502

503+
/**
 * Relocates a test case's top-level `input` into `vars.input`.
 *
 * A test case that has no own `input` property is returned as-is (the same
 * object). Otherwise a new object is returned that omits the top-level
 * `input` and `input_files` keys and carries a `vars` object whose `input`
 * is the former top-level value. Existing `vars` entries are kept when
 * `isRecord(testCase.vars)` holds; an existing `vars.input` is overwritten.
 *
 * NOTE(review): `input_files` is removed and not copied anywhere in this
 * function — confirm that discarding it is intended.
 *
 * @param testCase - Serialized test case; it is only read, never mutated.
 * @returns The original object, or a shallow copy with `input` moved under `vars`.
 */
function moveInputToVars(testCase: Record<string, unknown>): Record<string, unknown> {
504+
if (!Object.hasOwn(testCase, 'input')) {
505+
return testCase;
506+
}
507+
// Split off `input` and `input_files`; `_inputFiles` is not referenced below.
const { input, input_files: _inputFiles, ...caseWithoutInput } = testCase;
508+
return {
509+
...caseWithoutInput,
510+
vars: {
511+
// Spread existing vars first so the relocated `input` below takes precedence.
...(isRecord(testCase.vars) ? testCase.vars : {}),
512+
input,
513+
},
514+
};
515+
}
516+
502517
function serializeGraderDefinition(
503518
definition: Record<string, unknown>,
504519
rewrites: ReadonlyMap<string, string>,
@@ -518,14 +533,14 @@ function buildEvalCase(
518533
const testCase = rewritePathsDeep(parseSourceTestCase(test), rewrites) as Record<string, unknown>;
519534
const graderDefinitions = test.source?.graderDefinitions ?? [];
520535
if (graderDefinitions.length > 0) {
521-
return {
536+
return moveInputToVars({
522537
...withoutLegacyAssertionKeys(testCase),
523538
assert: graderDefinitions.map((grader) =>
524539
serializeGraderDefinition(grader.definition, rewrites),
525540
),
526-
};
541+
});
527542
}
528-
return testCase;
543+
return moveInputToVars(testCase);
529544
}
530545

531546
function targetReferenceNames(target: TargetDefinition): readonly string[] {
@@ -789,7 +804,10 @@ function buildPortableEvalCase(
789804
): Record<string, unknown> {
790805
const testCase = buildEvalCase(test, rewrites);
791806
testCase.id = test.id;
792-
testCase.input = test.input.map((message) => serializeMessage(message, rewrites));
807+
testCase.vars = {
808+
...(isRecord(testCase.vars) ? testCase.vars : {}),
809+
input: test.input.map((message) => serializeMessage(message, rewrites)),
810+
};
793811

794812
if (test.criteria.trim().length > 0) {
795813
testCase.criteria = test.criteria;
@@ -1037,6 +1055,7 @@ export async function materializeTaskBundle(
10371055

10381056
await writeYamlFile(evalPath, {
10391057
target: options.targetName,
1058+
prompts: [INPUT_PROMPT],
10401059
tests: [evalCase],
10411060
});
10421061
await writeYamlFile(targetsPath, { targets: serializeTargetDefinitions(targetDefinitions) });
@@ -1107,6 +1126,7 @@ export async function materializeEvalBundle(
11071126

11081127
await writeYamlFile(evalPath, {
11091128
...(runtime ?? {}),
1129+
prompts: [INPUT_PROMPT],
11101130
tests: options.tests.map((test) => buildPortableEvalCase(test, rewrites)),
11111131
});
11121132
await writeYamlFile(targetsPath, {

apps/cli/src/commands/pipeline/input.ts

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import { readFile } from 'node:fs/promises';
2222
import { mkdir, writeFile } from 'node:fs/promises';
2323
import { dirname, join, relative, resolve } from 'node:path';
2424

25-
import type { GraderConfig, LlmGraderConfig, ScriptGraderConfig } from '@agentv/core';
25+
import type { GraderConfig, LlmBackedGraderConfig, ScriptGraderConfig } from '@agentv/core';
2626

2727
/** Assertion types that can be graded deterministically without external scripts or LLMs. */
2828
const BUILTIN_ASSERTION_TYPES = new Set([
@@ -150,6 +150,7 @@ export const evalInputCommand = command({
150150
await writeJson(join(testDir, 'input.json'), {
151151
input: inputMessages,
152152
input_files: test.file_paths,
153+
...(test.description ? { description: test.description } : {}),
153154
metadata: test.metadata ?? {},
154155
});
155156

@@ -266,12 +267,12 @@ async function writeGraderConfigs(
266267
weight: config.weight ?? 1.0,
267268
config: config.config ?? {},
268269
});
269-
} else if (assertion.type === 'llm-grader') {
270+
} else if (assertion.type === 'llm-grader' || assertion.type === 'llm-rubric') {
270271
if (!hasLlmGraders) {
271272
await mkdir(llmGradersDir, { recursive: true });
272273
hasLlmGraders = true;
273274
}
274-
const config = assertion as LlmGraderConfig;
275+
const config = assertion as LlmBackedGraderConfig;
275276
let promptContent = '';
276277

277278
if (config.resolvedPromptPath) {
@@ -286,7 +287,7 @@ async function writeGraderConfigs(
286287

287288
// For rubrics assertions, include the criteria array directly
288289
// so grader subagents can evaluate without needing a prompt file.
289-
const rubrics = (config as LlmGraderConfig).rubrics;
290+
const rubrics = config.rubrics;
290291
const rubricsData = rubrics?.map((r) => ({
291292
id: r.id,
292293
outcome: r.outcome,
@@ -298,7 +299,11 @@ async function writeGraderConfigs(
298299

299300
await writeJson(join(llmGradersDir, `${config.name}.json`), {
300301
name: config.name,
302+
type: config.type,
301303
prompt_content: promptContent,
304+
...(config.type === 'llm-rubric' && config.value !== undefined
305+
? { value: config.value }
306+
: {}),
302307
...(rubricsData && rubricsData.length > 0 ? { rubrics: rubricsData } : {}),
303308
weight: config.weight ?? 1.0,
304309
threshold: 0.5,

apps/cli/src/commands/pipeline/run.ts

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import { tmpdir } from 'node:os';
1818
import { dirname, join, relative, resolve } from 'node:path';
1919

2020
import { deriveCategory, loadTestSuite } from '@agentv/core';
21-
import type { GraderConfig, LlmGraderConfig, ScriptGraderConfig } from '@agentv/core';
21+
import type { GraderConfig, LlmBackedGraderConfig, ScriptGraderConfig } from '@agentv/core';
2222
import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts';
2323

2424
import { buildDefaultRunDir } from '../eval/result-layout.js';
@@ -174,6 +174,7 @@ export const evalRunCommand = command({
174174
await writeJson(join(testDir, 'input.json'), {
175175
input: inputMessages,
176176
input_files: test.file_paths,
177+
...(test.description ? { description: test.description } : {}),
177178
metadata: test.metadata ?? {},
178179
});
179180

@@ -463,12 +464,12 @@ async function writeGraderConfigs(
463464
weight: config.weight ?? 1.0,
464465
config: config.config ?? {},
465466
});
466-
} else if (assertion.type === 'llm-grader') {
467+
} else if (assertion.type === 'llm-grader' || assertion.type === 'llm-rubric') {
467468
if (!hasLlmGraders) {
468469
await mkdir(llmGradersDir, { recursive: true });
469470
hasLlmGraders = true;
470471
}
471-
const config = assertion as LlmGraderConfig;
472+
const config = assertion as LlmBackedGraderConfig;
472473
let promptContent = '';
473474
if (config.resolvedPromptPath) {
474475
try {
@@ -480,7 +481,7 @@ async function writeGraderConfigs(
480481
promptContent = config.prompt;
481482
}
482483
// For rubrics assertions, include the criteria array directly
483-
const rubrics = (config as LlmGraderConfig).rubrics;
484+
const rubrics = config.rubrics;
484485
const rubricsData = rubrics?.map((r) => ({
485486
id: r.id,
486487
outcome: r.outcome,
@@ -492,7 +493,11 @@ async function writeGraderConfigs(
492493

493494
await writeJson(join(llmGradersDir, `${config.name}.json`), {
494495
name: config.name,
496+
type: config.type,
495497
prompt_content: promptContent,
498+
...(config.type === 'llm-rubric' && config.value !== undefined
499+
? { value: config.value }
500+
: {}),
496501
...(rubricsData && rubricsData.length > 0 ? { rubrics: rubricsData } : {}),
497502
weight: config.weight ?? 1.0,
498503
threshold: 0.5,

apps/cli/test/commands/eval/artifact-writer.test.ts

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2286,7 +2286,15 @@ describe('writeArtifactsFromResults', () => {
22862286
const envFile = path.join(sourceRoot, '.env');
22872287
await writeFile(
22882288
evalFile,
2289-
['api_key: literal-secret', 'tests:', ' - id: trace-case', ' input: hello'].join('\n'),
2289+
[
2290+
'api_key: literal-secret',
2291+
'prompts:',
2292+
' - "{{ input }}"',
2293+
'tests:',
2294+
' - id: trace-case',
2295+
' vars:',
2296+
' input: hello',
2297+
].join('\n'),
22902298
);
22912299
await writeFile(inputFile, 'input fixture\n');
22922300
await writeFile(promptFile, 'grade this response\n');
@@ -2427,7 +2435,8 @@ describe('writeArtifactsFromResults', () => {
24272435
const [testCase] = parsedEval.tests as Record<string, unknown>[];
24282436
const [assertion] = testCase.assert as Record<string, unknown>[];
24292437
expect(parsedEval.target).toBe('gpt-4o');
2430-
expect(testCase.input).toBe('file://files/src/input.txt');
2438+
expect(parsedEval.prompts).toEqual(['{{ input }}']);
2439+
expect((testCase.vars as Record<string, unknown>).input).toBe('file://files/src/input.txt');
24312440
expect(assertion.prompt).toBe('file://graders/src/grader.md');
24322441
expect(assertion.prompt_script).toEqual([
24332442
'bun',
@@ -2450,7 +2459,17 @@ describe('writeArtifactsFromResults', () => {
24502459
await mkdir(path.dirname(evalFile), { recursive: true });
24512460
await writeFile(
24522461
evalFile,
2453-
['tests:', ' - id: alpha', ' input: A', ' - id: beta', ' input: B'].join('\n'),
2462+
[
2463+
'prompts:',
2464+
' - "{{ input }}"',
2465+
'tests:',
2466+
' - id: alpha',
2467+
' vars:',
2468+
' input: A',
2469+
' - id: beta',
2470+
' vars:',
2471+
' input: B',
2472+
].join('\n'),
24542473
);
24552474
const sourceTests = ['alpha', 'beta'].map(
24562475
(id) =>

apps/cli/test/commands/eval/bundle.test.ts

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,10 @@ tests: ../data/cases.yaml
149149
template: 'workspaces/workspace-template',
150150
hooks: { before_each: { command: ['bun', 'scripts/scripts/setup.ts'] } },
151151
});
152-
const input = testCase.input as Array<{ content: Array<Record<string, unknown>> }>;
152+
expect(bundledEval.prompts).toEqual(['{{ input }}']);
153+
const input = (testCase.vars as Record<string, unknown>).input as Array<{
154+
content: Array<Record<string, unknown>>;
155+
}>;
153156
expect(input[0]?.content[0]).toEqual({ type: 'file', value: 'files/data/input.txt' });
154157

155158
const bundledTargets = await readFile(path.join(bundleDir, 'targets.yaml'), 'utf8');
@@ -181,12 +184,15 @@ tests: ../data/cases.yaml
181184
- id: candidate
182185
provider: mock
183186
response: '{"answer":"inline bundled response"}'
187+
prompts:
188+
- "{{ input }}"
184189
tests:
185190
- id: inline-case
186-
input: hello
187191
assert:
188192
- type: contains
189193
value: inline
194+
vars:
195+
input: hello
190196
`,
191197
'utf8',
192198
);
@@ -223,12 +229,15 @@ tests:
223229
path.join(sourceDir, 'evals', 'missing-template.eval.yaml'),
224230
`workspace:
225231
template: ../does-not-exist
232+
prompts:
233+
- "{{ input }}"
226234
tests:
227235
- id: missing-template
228-
input: hello
229236
assert:
230237
- type: contains
231238
value: Mock
239+
vars:
240+
input: hello
232241
`,
233242
'utf8',
234243
);
Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
11
name: builtin-test
2+
prompts:
3+
- "{{ input }}"
24
tests:
35
- id: test-01
4-
input: hello world
5-
criteria: Response echoes the input
66
assert:
7+
- metric: echoes_input
8+
type: llm-rubric
9+
value: Response echoes the input
710
- metric: has_hello
811
type: contains
912
value: hello
1013
- metric: matches_pattern
1114
type: regex
12-
value: "h[aeiou]llo"
15+
value: h[aeiou]llo
1316
- metric: is_valid_json
1417
type: is-json
18+
vars:
19+
input: hello world

0 commit comments

Comments
 (0)