Skip to content

Commit 14352fd

Browse files
committed
feat(cli): simplify eval output surface
1 parent f116231 commit 14352fd

24 files changed

Lines changed: 376 additions & 96 deletions

File tree

.beads/issues.jsonl

Lines changed: 2 additions & 1 deletion
Large diffs are not rendered by default.

AGENTS.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,8 @@ Unit tests alone are insufficient for grader changes. After implementing or modi
389389
```bash
390390
# 1. Run the eval, writing results to a sibling *.results.jsonl file
391391
bun apps/cli/src/cli.ts eval examples/path/to/suite.eval.yaml --target azure \
392-
--out examples/path/to/suite.results.jsonl
392+
--output examples/path/to/suite.run \
393+
--export examples/path/to/suite.results.jsonl
393394

394395
# 2. Assert all expected score ranges pass
395396
bun scripts/check-grader-scores.ts
@@ -400,7 +401,7 @@ The script auto-discovers `examples/**/*.grader-scores.yaml`, locates the siblin
400401
**To add score checks for a new eval:**
401402
1. Create `<eval-stem>.grader-scores.yaml` next to the eval YAML.
402403
2. Add entries for each `(test_id, grader, range)` you care about — `grader` must match a `scores[].name` value in the JSONL output, and `range.min`/`range.max` default to 0/1 if omitted.
403-
3. Run the eval with `--out <eval-stem>.results.jsonl`, then run the script.
404+
3. Run the eval with `--output <eval-stem>.run --export <eval-stem>.results.jsonl`, then run the script.
404405

405406
See `examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml` for a concrete example.
406407

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,9 @@ agentv compare .agentv/results/runs/<timestamp>/index.jsonl
7777
## Output formats
7878

7979
```bash
80-
agentv eval evals/my-eval.yaml # JSONL (default)
81-
agentv eval evals/my-eval.yaml -o report.html # HTML dashboard
82-
agentv eval evals/my-eval.yaml -o results.xml # JUnit XML for CI
80+
agentv eval evals/my-eval.yaml --output ./run # writes ./run/index.jsonl
81+
agentv eval evals/my-eval.yaml --export report.html       # HTML dashboard
82+
agentv eval evals/my-eval.yaml --export results.xml # JUnit XML for CI
8383
```
8484

8585
## TypeScript SDK

apps/cli/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "agentv",
3-
"version": "4.32.0-next.1",
3+
"version": "4.32.0-next.2",
44
"description": "CLI entry point for AgentV",
55
"type": "module",
66
"repository": {

apps/cli/src/commands/eval/commands/run.ts

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,19 +46,19 @@ export const evalRunCommand = command({
4646
out: option({
4747
type: optional(string),
4848
long: 'out',
49-
description: '[Deprecated: use --output] Write results to the specified path',
49+
description: '[Removed: use --output <dir> and --export <file>] Former flat result path',
5050
}),
5151
output: option({
5252
type: optional(string),
5353
long: 'output',
5454
short: 'o',
5555
description:
56-
'Artifact directory for run output (index.jsonl, benchmark.json, per-test grading/timing)',
56+
'Run artifact directory (writes index.jsonl, benchmark.json, timing, and per-test artifacts)',
5757
}),
5858
outputFormat: option({
5959
type: optional(string),
6060
long: 'output-format',
61-
description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)",
61+
description: '[Removed: use --export <file>] Run directories always write index.jsonl',
6262
}),
6363
experiment: option({
6464
type: optional(string),
@@ -188,8 +188,7 @@ export const evalRunCommand = command({
188188
artifacts: option({
189189
type: optional(string),
190190
long: 'artifacts',
191-
description:
192-
'[Deprecated: use --output] Write companion artifacts to the specified directory',
191+
description: '[Removed: use --output <dir>] Former companion artifact directory',
193192
}),
194193
graderTarget: option({
195194
type: optional(string),

apps/cli/src/commands/eval/run-eval.ts

Lines changed: 77 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,10 @@ interface NormalizedOptions {
8484
readonly targetsPath?: string;
8585
readonly filter?: string | readonly string[];
8686
readonly workers?: number;
87-
/** --output <dir>: artifact directory (new canonical meaning) */
87+
/** --output <dir>: canonical artifact directory */
8888
readonly outputDir?: string;
89-
/** Legacy --out <path>: deprecated, treated as artifact dir */
90-
readonly outPath?: string;
89+
/** Removed: use --output for run directories and --export for extra files */
90+
readonly removedOut?: string;
9191
/** --export <paths...>: additional output files */
9292
readonly exportPaths: readonly string[];
9393
readonly dryRun: boolean;
@@ -115,8 +115,10 @@ interface NormalizedOptions {
115115
readonly keepWorkspaces: boolean;
116116
/** Deprecated: benchmark.json is always written to artifact dir */
117117
readonly benchmarkJson?: string;
118-
/** Deprecated: use --output instead */
118+
/** Removed: use --output instead */
119119
readonly artifacts?: string;
120+
/** Removed: the run directory always uses index.jsonl */
121+
readonly outputFormat?: string;
120122
readonly graderTarget?: string;
121123
readonly model?: string;
122124
readonly outputMessages: number | 'all';
@@ -227,6 +229,43 @@ function normalizeSourceMetadataByEvalFile(
227229
return undefined;
228230
}
229231

232+
const LEGACY_OUTPUT_FILE_EXTENSIONS = new Set([
233+
'.jsonl',
234+
'.json',
235+
'.xml',
236+
'.yaml',
237+
'.yml',
238+
'.html',
239+
'.htm',
240+
]);
241+
242+
function looksLikeLegacyOutputFilePath(value: string): boolean {
243+
return LEGACY_OUTPUT_FILE_EXTENSIONS.has(path.extname(value).toLowerCase());
244+
}
245+
246+
function outputFileMigrationMessage(value: string): string {
247+
const ext = path.extname(value).toLowerCase();
248+
const exportHint =
249+
ext === '.xml'
250+
? `Use --export ${value} for JUnit XML.`
251+
: `Use --export ${value} if you still need that extra file.`;
252+
return `--output expects a run directory, not a file path: ${value}\n${exportHint} Set --output <dir> for the canonical run artifacts; AgentV always writes <dir>/index.jsonl.`;
253+
}
254+
255+
function artifactsMigrationMessage(artifactsDir: string, outputDir?: string): string {
256+
const lines = [`--artifacts was removed from agentv eval. Use --output ${artifactsDir} instead.`];
257+
if (outputDir && looksLikeLegacyOutputFilePath(outputDir)) {
258+
const ext = path.extname(outputDir).toLowerCase();
259+
lines.push(
260+
ext === '.xml'
261+
? `Use --export ${outputDir} for JUnit XML.`
262+
: `Use --export ${outputDir} if you still need that extra file.`,
263+
);
264+
lines.push(`Migration example: --output ${artifactsDir} --export ${outputDir}`);
265+
}
266+
return lines.join('\n');
267+
}
268+
230269
/**
231270
* Check whether an eval file's tags satisfy --tag / --exclude-tag filters.
232271
*
@@ -316,7 +355,6 @@ function normalizeOptions(
316355
const configWorkers = config?.execution?.workers;
317356
const workers = cliWorkers ?? configWorkers ?? 0;
318357

319-
// --output is now a single optional string (artifact directory)
320358
const cliOutputDir = normalizeString(rawOptions.output);
321359

322360
// --export is the new repeatable flag for additional output files
@@ -354,9 +392,9 @@ function normalizeOptions(
354392
const configCacheEnabled = config?.cache?.enabled;
355393
const configCachePath = normalizeString(config?.cache?.path);
356394

357-
// Output dir: CLI --out > config output.dir > auto-generated
395+
// Output dir: CLI --output > config output.dir > auto-generated (--out is read below only so it can be rejected with a migration error)
358396
const cliOut = normalizeString(rawOptions.out);
359-
const configOut = config?.output?.dir;
397+
const configOutputDir = normalizeString(config?.output?.dir);
360398
const cliWorkspacePath = normalizeString(rawOptions.workspacePath);
361399
const cliWorkspaceModeRaw = normalizeString(rawOptions.workspaceMode);
362400
const cliWorkspaceMode = normalizeWorkspaceMode(rawOptions.workspaceMode);
@@ -376,8 +414,8 @@ function normalizeOptions(
376414
targetsPath: normalizeString(rawOptions.targets),
377415
filter: normalizeFilter(rawOptions.filter),
378416
workers: workers > 0 ? workers : undefined,
379-
outputDir: cliOutputDir,
380-
outPath: cliOut ?? configOut,
417+
outputDir: cliOutputDir ?? configOutputDir,
418+
removedOut: cliOut,
381419
exportPaths,
382420
dryRun: normalizeBoolean(rawOptions.dryRun),
383421
dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
@@ -425,6 +463,7 @@ function normalizeOptions(
425463
config?.execution?.keepWorkspaces === true,
426464
benchmarkJson: normalizeString(rawOptions.benchmarkJson),
427465
artifacts: normalizeString(rawOptions.artifacts),
466+
outputFormat: normalizeString(rawOptions.outputFormat),
428467
graderTarget: normalizeString(rawOptions.graderTarget),
429468
model: normalizeString(rawOptions.model),
430469
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
@@ -1096,6 +1135,27 @@ export async function runEvalCommand(
10961135
throw new Error('--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)');
10971136
}
10981137

1138+
if (options.removedOut) {
1139+
throw new Error(
1140+
[
1141+
'--out was removed from agentv eval. Use --output <dir> for the canonical run directory.',
1142+
'If you need an additional flat file, add --export <file>.',
1143+
`Migration example: --out ${options.removedOut} -> --output <dir> --export ${options.removedOut}`,
1144+
].join('\n'),
1145+
);
1146+
}
1147+
if (options.outputFormat) {
1148+
throw new Error(
1149+
'--output-format was removed from agentv eval. The run directory always writes index.jsonl; use --export <file> for JSON, XML/JUnit, YAML, or HTML copies.',
1150+
);
1151+
}
1152+
if (options.artifacts) {
1153+
throw new Error(artifactsMigrationMessage(options.artifacts, options.outputDir));
1154+
}
1155+
if (options.outputDir && looksLikeLegacyOutputFilePath(options.outputDir)) {
1156+
throw new Error(outputFileMigrationMessage(options.outputDir));
1157+
}
1158+
10991159
// --retry-errors: resume from a previous run by re-running execution_error and missing test cases.
11001160
// Uses an exclusion filter to skip already-completed (non-error) cases, which naturally includes
11011161
// both error cases and cases that never ran (e.g., due to a crash or interrupt).
@@ -1125,7 +1185,7 @@ export async function runEvalCommand(
11251185
// last-known run dir for this cwd from .agentv/cache.json. Matches promptfoo's
11261186
// `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default"
11271187
// convention. The cache pointer is written by saveRunCache after every eval.
1128-
if (options.resume && !options.retryErrors && !options.outputDir && !options.artifacts) {
1188+
if (options.resume && !options.retryErrors && !options.outputDir) {
11291189
const cachedDir = await resolveCachedRunDir(cwd);
11301190
if (cachedDir) {
11311191
options = { ...options, outputDir: cachedDir };
@@ -1140,7 +1200,7 @@ export async function runEvalCommand(
11401200
let resumeSkipKeys: Set<string> | undefined;
11411201
let isResumeAppend = false;
11421202
if (options.resume && !options.retryErrors) {
1143-
const explicitResumeDir = options.outputDir ?? options.artifacts;
1203+
const explicitResumeDir = options.outputDir;
11441204
if (explicitResumeDir) {
11451205
const resumeIndexPath = path.join(path.resolve(explicitResumeDir), 'index.jsonl');
11461206
if (existsSync(resumeIndexPath)) {
@@ -1190,50 +1250,27 @@ export async function runEvalCommand(
11901250
console.log(`Repository root: ${repoRoot}`);
11911251
}
11921252

1193-
// Emit deprecation warnings for legacy flags
1194-
if (options.outPath) {
1195-
console.warn('Warning: --out is deprecated. Use --output <dir> to set the artifact directory.');
1196-
}
1197-
if (options.artifacts) {
1198-
console.warn(
1199-
'Warning: --artifacts is deprecated. Use --output <dir> to set the artifact directory.',
1200-
);
1201-
}
1253+
// Emit deprecation warnings for remaining legacy flags.
12021254
if (options.benchmarkJson) {
12031255
console.warn(
12041256
'Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory.',
12051257
);
12061258
}
1207-
if (normalizeString(input.rawOptions.outputFormat)) {
1208-
console.warn(
1209-
'Warning: --output-format is deprecated. The artifact directory always uses JSONL.',
1210-
);
1211-
}
12121259

12131260
// Resolve artifact directory (runDir) and primary output path.
1214-
// Precedence: --output > --artifacts (deprecated) > --out (deprecated) > default
1215-
const explicitDir = options.outputDir ?? options.artifacts;
1261+
// Precedence: --output > config output.dir > default
1262+
const explicitDir = options.outputDir;
12161263
let runDir: string;
12171264
let outputPath: string;
1218-
let usesDefaultArtifactWorkspace: boolean;
12191265

12201266
if (explicitDir) {
1221-
// --output <dir> or --artifacts <dir>: use as artifact directory
12221267
runDir = path.resolve(explicitDir);
12231268
mkdirSync(runDir, { recursive: true });
12241269
outputPath = path.join(runDir, 'index.jsonl');
1225-
usesDefaultArtifactWorkspace = true;
1226-
} else if (options.outPath) {
1227-
// --out <path> (deprecated): use dirname as artifact dir
1228-
outputPath = path.resolve(options.outPath);
1229-
runDir = path.dirname(outputPath);
1230-
mkdirSync(runDir, { recursive: true });
1231-
usesDefaultArtifactWorkspace = false;
12321270
} else {
12331271
// Default: .agentv/results/runs/<experiment>/<timestamp>/
12341272
outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
12351273
runDir = path.dirname(outputPath);
1236-
usesDefaultArtifactWorkspace = true;
12371274
}
12381275

12391276
// Initialize OTel exporter if --export-otel flag is set or file export flags are used
@@ -1545,7 +1582,7 @@ export async function runEvalCommand(
15451582
// has execution_status: ok. The end-of-run write preserves this value via
15461583
// readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
15471584
// Skip on resume — we want to preserve the *original* planned count.
1548-
if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) {
1585+
if (!isResumeAppend && totalEvalCount > 0) {
15491586
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
15501587
await writeInitialBenchmarkArtifact(runDir, {
15511588
evalFile,
@@ -1719,7 +1756,7 @@ export async function runEvalCommand(
17191756

17201757
// When resuming, compute summary from ALL results (old + new, deduplicated)
17211758
let summaryResults = allResults;
1722-
if (isResumeAppend && usesDefaultArtifactWorkspace) {
1759+
if (isResumeAppend) {
17231760
const content = await readFile(outputPath, 'utf8');
17241761
summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content));
17251762
}
@@ -1747,7 +1784,7 @@ export async function runEvalCommand(
17471784
}
17481785

17491786
// Write artifacts to the run directory (always, not conditional on flags)
1750-
if (usesDefaultArtifactWorkspace && allResults.length > 0) {
1787+
if (allResults.length > 0) {
17511788
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
17521789
const sourceTests = activeTestFiles.flatMap(
17531790
(activeTestFile) => fileMetadata.get(activeTestFile)?.testCases ?? [],

0 commit comments

Comments
 (0)