Skip to content

Commit b14ea2c

Browse files
committed
feat(cli): simplify eval output surface
1 parent 35263cd commit b14ea2c

24 files changed

Lines changed: 383 additions & 103 deletions

File tree

.beads/issues.jsonl

Lines changed: 9 additions & 8 deletions
Large diffs are not rendered by default.

AGENTS.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,8 @@ Unit tests alone are insufficient for grader changes. After implementing or modi
389389
```bash
390390
# 1. Run the eval, writing results to a sibling *.results.jsonl file
391391
bun apps/cli/src/cli.ts eval examples/path/to/suite.eval.yaml --target azure \
392-
--out examples/path/to/suite.results.jsonl
392+
--output examples/path/to/suite.run \
393+
--export examples/path/to/suite.results.jsonl
393394

394395
# 2. Assert all expected score ranges pass
395396
bun scripts/check-grader-scores.ts
@@ -400,7 +401,7 @@ The script auto-discovers `examples/**/*.grader-scores.yaml`, locates the siblin
400401
**To add score checks for a new eval:**
401402
1. Create `<eval-stem>.grader-scores.yaml` next to the eval YAML.
402403
2. Add entries for each `(test_id, grader, range)` you care about — `grader` must match a `scores[].name` value in the JSONL output, and `range.min`/`range.max` default to 0/1 if omitted.
403-
3. Run the eval with `--out <eval-stem>.results.jsonl`, then run the script.
404+
3. Run the eval with `--output <eval-stem>.run --export <eval-stem>.results.jsonl`, then run the script.
404405

405406
See `examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml` for a concrete example.
406407

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,9 @@ agentv compare .agentv/results/runs/<timestamp>/index.jsonl
7777
## Output formats
7878

7979
```bash
80-
agentv eval evals/my-eval.yaml # JSONL (default)
81-
agentv eval evals/my-eval.yaml -o report.html # HTML dashboard
82-
agentv eval evals/my-eval.yaml -o results.xml # JUnit XML for CI
80+
agentv eval evals/my-eval.yaml --output ./run # writes ./run/index.jsonl
81+
agentv eval evals/my-eval.yaml --export report.html # HTML dashboard
82+
agentv eval evals/my-eval.yaml --export results.xml # JUnit XML for CI
8383
```
8484

8585
## TypeScript SDK

apps/cli/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "agentv",
3-
"version": "4.32.0-next.1",
3+
"version": "4.32.0-next.2",
44
"description": "CLI entry point for AgentV",
55
"type": "module",
66
"repository": {

apps/cli/src/commands/eval/commands/run.ts

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,19 +46,19 @@ export const evalRunCommand = command({
4646
out: option({
4747
type: optional(string),
4848
long: 'out',
49-
description: '[Deprecated: use --output] Write results to the specified path',
49+
description: '[Removed: use --output <dir> and --export <file>] Former flat result path',
5050
}),
5151
output: option({
5252
type: optional(string),
5353
long: 'output',
5454
short: 'o',
5555
description:
56-
'Artifact directory for run output (index.jsonl, benchmark.json, per-test grading/timing)',
56+
'Run artifact directory (writes index.jsonl, benchmark.json, timing, and per-test artifacts)',
5757
}),
5858
outputFormat: option({
5959
type: optional(string),
6060
long: 'output-format',
61-
description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)",
61+
description: '[Removed: use --export <file>] Run directories always write index.jsonl',
6262
}),
6363
experiment: option({
6464
type: optional(string),
@@ -188,8 +188,7 @@ export const evalRunCommand = command({
188188
artifacts: option({
189189
type: optional(string),
190190
long: 'artifacts',
191-
description:
192-
'[Deprecated: use --output] Write companion artifacts to the specified directory',
191+
description: '[Removed: use --output <dir>] Former companion artifact directory',
193192
}),
194193
graderTarget: option({
195194
type: optional(string),

apps/cli/src/commands/eval/run-eval.ts

Lines changed: 77 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,10 @@ interface NormalizedOptions {
8484
readonly targetsPath?: string;
8585
readonly filter?: string | readonly string[];
8686
readonly workers?: number;
87-
/** --output <dir>: artifact directory (new canonical meaning) */
87+
/** --output <dir>: canonical artifact directory */
8888
readonly outputDir?: string;
89-
/** Legacy --out <path>: deprecated, treated as artifact dir */
90-
readonly outPath?: string;
89+
/** Removed: use --output for run directories and --export for extra files */
90+
readonly removedOut?: string;
9191
/** --export <paths...>: additional output files */
9292
readonly exportPaths: readonly string[];
9393
readonly dryRun: boolean;
@@ -115,8 +115,10 @@ interface NormalizedOptions {
115115
readonly keepWorkspaces: boolean;
116116
/** Deprecated: benchmark.json is always written to artifact dir */
117117
readonly benchmarkJson?: string;
118-
/** Deprecated: use --output instead */
118+
/** Removed: use --output instead */
119119
readonly artifacts?: string;
120+
/** Removed: the run directory always uses index.jsonl */
121+
readonly outputFormat?: string;
120122
readonly graderTarget?: string;
121123
readonly model?: string;
122124
readonly outputMessages: number | 'all';
@@ -197,6 +199,43 @@ function normalizeFilter(value: unknown): string | readonly string[] | undefined
197199
return normalizeString(value);
198200
}
199201

202+
/**
 * Extensions that mark a path as a flat result file rather than a run
 * directory. Compared against the lower-cased `path.extname` of the value.
 */
const LEGACY_OUTPUT_FILE_EXTENSIONS: ReadonlySet<string> = new Set([
  '.jsonl',
  '.json',
  '.xml',
  '.yaml',
  '.yml',
  '.html',
  '.htm',
]);

/**
 * Report whether `value` ends in one of the known result-file extensions.
 *
 * @param value - Path supplied where a run directory is expected.
 * @returns `true` when the extension (case-insensitive) is in
 *   `LEGACY_OUTPUT_FILE_EXTENSIONS`.
 */
function looksLikeLegacyOutputFilePath(value: string): boolean {
  return LEGACY_OUTPUT_FILE_EXTENSIONS.has(path.extname(value).toLowerCase());
}

/**
 * Build the one-line `--export` suggestion for a file-like path.
 *
 * Shared by both migration messages so the wording cannot drift between them.
 * `.xml` paths get the JUnit-specific wording; every other path gets the
 * generic wording.
 */
function exportHintForLegacyFile(filePath: string): string {
  return path.extname(filePath).toLowerCase() === '.xml'
    ? `Use --export ${filePath} for JUnit XML.`
    : `Use --export ${filePath} if you still need that extra file.`;
}

/**
 * Compose the error text shown when `--output` receives a file path.
 *
 * @param value - The file-like path that was passed as the output directory.
 * @returns A two-line message: the rejection, then the `--export` hint followed
 *   by the reminder that the run directory always contains `index.jsonl`.
 */
function outputFileMigrationMessage(value: string): string {
  const exportHint = exportHintForLegacyFile(value);
  return `--output expects a run directory, not a file path: ${value}\n${exportHint} Set --output <dir> for the canonical run artifacts; AgentV always writes <dir>/index.jsonl.`;
}

/**
 * Compose the error text shown when the removed `--artifacts` flag is used.
 *
 * @param artifactsDir - Value that was passed to `--artifacts`.
 * @param outputDir - Resolved output directory value, if any. When it looks
 *   like a result file, the message also explains how to split it into
 *   `--output <dir>` plus `--export <file>`.
 * @returns Newline-joined migration guidance.
 */
function artifactsMigrationMessage(artifactsDir: string, outputDir?: string): string {
  const lines = [`--artifacts was removed from agentv eval. Use --output ${artifactsDir} instead.`];
  if (outputDir && looksLikeLegacyOutputFilePath(outputDir)) {
    lines.push(exportHintForLegacyFile(outputDir));
    lines.push(`Migration example: --output ${artifactsDir} --export ${outputDir}`);
  }
  return lines.join('\n');
}
238+
200239
/**
201240
* Check whether an eval file's tags satisfy --tag / --exclude-tag filters.
202241
*
@@ -286,7 +325,6 @@ function normalizeOptions(
286325
const configWorkers = config?.execution?.workers;
287326
const workers = cliWorkers ?? configWorkers ?? 0;
288327

289-
// --output is now a single optional string (artifact directory)
290328
const cliOutputDir = normalizeString(rawOptions.output);
291329

292330
// --export is the new repeatable flag for additional output files
@@ -324,9 +362,9 @@ function normalizeOptions(
324362
const configCacheEnabled = config?.cache?.enabled;
325363
const configCachePath = normalizeString(config?.cache?.path);
326364

327-
// Output dir: CLI --out > config output.dir > auto-generated
365+
// Output dir: CLI --output > config output.dir > auto-generated
328366
const cliOut = normalizeString(rawOptions.out);
329-
const configOut = config?.output?.dir;
367+
const configOutputDir = normalizeString(config?.output?.dir);
330368
const cliWorkspacePath = normalizeString(rawOptions.workspacePath);
331369
const cliWorkspaceModeRaw = normalizeString(rawOptions.workspaceMode);
332370
const cliWorkspaceMode = normalizeWorkspaceMode(rawOptions.workspaceMode);
@@ -346,8 +384,8 @@ function normalizeOptions(
346384
targetsPath: normalizeString(rawOptions.targets),
347385
filter: normalizeFilter(rawOptions.filter),
348386
workers: workers > 0 ? workers : undefined,
349-
outputDir: cliOutputDir,
350-
outPath: cliOut ?? configOut,
387+
outputDir: cliOutputDir ?? configOutputDir,
388+
removedOut: cliOut,
351389
exportPaths,
352390
dryRun: normalizeBoolean(rawOptions.dryRun),
353391
dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
@@ -395,6 +433,7 @@ function normalizeOptions(
395433
config?.execution?.keepWorkspaces === true,
396434
benchmarkJson: normalizeString(rawOptions.benchmarkJson),
397435
artifacts: normalizeString(rawOptions.artifacts),
436+
outputFormat: normalizeString(rawOptions.outputFormat),
398437
graderTarget: normalizeString(rawOptions.graderTarget),
399438
model: normalizeString(rawOptions.model),
400439
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
@@ -1044,6 +1083,27 @@ export async function runEvalCommand(
10441083
throw new Error('--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)');
10451084
}
10461085

1086+
if (options.removedOut) {
1087+
throw new Error(
1088+
[
1089+
'--out was removed from agentv eval. Use --output <dir> for the canonical run directory.',
1090+
'If you need an additional flat file, add --export <file>.',
1091+
`Migration example: --out ${options.removedOut} -> --output <dir> --export ${options.removedOut}`,
1092+
].join('\n'),
1093+
);
1094+
}
1095+
if (options.outputFormat) {
1096+
throw new Error(
1097+
'--output-format was removed from agentv eval. The run directory always writes index.jsonl; use --export <file> for JSON, XML/JUnit, YAML, or HTML copies.',
1098+
);
1099+
}
1100+
if (options.artifacts) {
1101+
throw new Error(artifactsMigrationMessage(options.artifacts, options.outputDir));
1102+
}
1103+
if (options.outputDir && looksLikeLegacyOutputFilePath(options.outputDir)) {
1104+
throw new Error(outputFileMigrationMessage(options.outputDir));
1105+
}
1106+
10471107
// --retry-errors: resume from a previous run by re-running execution_error and missing test cases.
10481108
// Uses an exclusion filter to skip already-completed (non-error) cases, which naturally includes
10491109
// both error cases and cases that never ran (e.g., due to a crash or interrupt).
@@ -1073,7 +1133,7 @@ export async function runEvalCommand(
10731133
// last-known run dir for this cwd from .agentv/cache.json. Matches promptfoo's
10741134
// `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default"
10751135
// convention. The cache pointer is written by saveRunCache after every eval.
1076-
if (options.resume && !options.retryErrors && !options.outputDir && !options.artifacts) {
1136+
if (options.resume && !options.retryErrors && !options.outputDir) {
10771137
const cachedDir = await resolveCachedRunDir(cwd);
10781138
if (cachedDir) {
10791139
options = { ...options, outputDir: cachedDir };
@@ -1088,7 +1148,7 @@ export async function runEvalCommand(
10881148
let resumeSkipKeys: Set<string> | undefined;
10891149
let isResumeAppend = false;
10901150
if (options.resume && !options.retryErrors) {
1091-
const explicitResumeDir = options.outputDir ?? options.artifacts;
1151+
const explicitResumeDir = options.outputDir;
10921152
if (explicitResumeDir) {
10931153
const resumeIndexPath = path.join(path.resolve(explicitResumeDir), 'index.jsonl');
10941154
if (existsSync(resumeIndexPath)) {
@@ -1138,50 +1198,27 @@ export async function runEvalCommand(
11381198
console.log(`Repository root: ${repoRoot}`);
11391199
}
11401200

1141-
// Emit deprecation warnings for legacy flags
1142-
if (options.outPath) {
1143-
console.warn('Warning: --out is deprecated. Use --output <dir> to set the artifact directory.');
1144-
}
1145-
if (options.artifacts) {
1146-
console.warn(
1147-
'Warning: --artifacts is deprecated. Use --output <dir> to set the artifact directory.',
1148-
);
1149-
}
1201+
// Emit deprecation warnings for remaining legacy flags.
11501202
if (options.benchmarkJson) {
11511203
console.warn(
11521204
'Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory.',
11531205
);
11541206
}
1155-
if (normalizeString(input.rawOptions.outputFormat)) {
1156-
console.warn(
1157-
'Warning: --output-format is deprecated. The artifact directory always uses JSONL.',
1158-
);
1159-
}
11601207

11611208
// Resolve artifact directory (runDir) and primary output path.
1162-
// Precedence: --output > --artifacts (deprecated) > --out (deprecated) > default
1163-
const explicitDir = options.outputDir ?? options.artifacts;
1209+
// Precedence: --output > config output.dir > default
1210+
const explicitDir = options.outputDir;
11641211
let runDir: string;
11651212
let outputPath: string;
1166-
let usesDefaultArtifactWorkspace: boolean;
11671213

11681214
if (explicitDir) {
1169-
// --output <dir> or --artifacts <dir>: use as artifact directory
11701215
runDir = path.resolve(explicitDir);
11711216
mkdirSync(runDir, { recursive: true });
11721217
outputPath = path.join(runDir, 'index.jsonl');
1173-
usesDefaultArtifactWorkspace = true;
1174-
} else if (options.outPath) {
1175-
// --out <path> (deprecated): use dirname as artifact dir
1176-
outputPath = path.resolve(options.outPath);
1177-
runDir = path.dirname(outputPath);
1178-
mkdirSync(runDir, { recursive: true });
1179-
usesDefaultArtifactWorkspace = false;
11801218
} else {
11811219
// Default: .agentv/results/runs/<experiment>/<timestamp>/
11821220
outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
11831221
runDir = path.dirname(outputPath);
1184-
usesDefaultArtifactWorkspace = true;
11851222
}
11861223

11871224
// Initialize OTel exporter if --export-otel flag is set or file export flags are used
@@ -1493,7 +1530,7 @@ export async function runEvalCommand(
14931530
// has execution_status: ok. The end-of-run write preserves this value via
14941531
// readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
14951532
// Skip on resume — we want to preserve the *original* planned count.
1496-
if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) {
1533+
if (!isResumeAppend && totalEvalCount > 0) {
14971534
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
14981535
await writeInitialBenchmarkArtifact(runDir, {
14991536
evalFile,
@@ -1659,7 +1696,7 @@ export async function runEvalCommand(
16591696

16601697
// When resuming, compute summary from ALL results (old + new, deduplicated)
16611698
let summaryResults = allResults;
1662-
if (isResumeAppend && usesDefaultArtifactWorkspace) {
1699+
if (isResumeAppend) {
16631700
const content = await readFile(outputPath, 'utf8');
16641701
summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content));
16651702
}
@@ -1687,7 +1724,7 @@ export async function runEvalCommand(
16871724
}
16881725

16891726
// Write artifacts to the run directory (always, not conditional on flags)
1690-
if (usesDefaultArtifactWorkspace && allResults.length > 0) {
1727+
if (allResults.length > 0) {
16911728
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
16921729
const sourceTests = activeTestFiles.flatMap(
16931730
(activeTestFile) => fileMetadata.get(activeTestFile)?.testCases ?? [],

0 commit comments

Comments
 (0)