Skip to content

Commit 770bd0f

Browse files
christso, Copilot, and claude
authored
feat(cli): add results report subcommand (#1105)
* feat(skill): add static html report export skill Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * feat(cli): add results report subcommand Move the static HTML report flow into a first-class results command that reads existing artifact workspaces, and remove the earlier skill-based implementation. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * docs(results): add html report screenshots Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * fix(results): simplify pass rate pill Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * fix(results): widen detail pass rate pill Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * docs(results): move report screenshots to tools Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * fix(results): simplify detail pass rate metric Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * docs(results): consolidate tools docs Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * fix(results): restore narrow pass rate pills Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * fix(results): restore assertion badge styling Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * feat(results): simplify report UX aligned with gist reference - Merge separate "Passed/Failed Assertions" cards into a single unified list with inline checkmark/cross icons per assertion - Add "Criteria" column to test cases table, extracted from input prompt field for human-readable test descriptions - Remove redundant "Grader Results" table from detail panel (info already visible in per-grader score columns) - Collapse Input/Output into a toggleable details element so assertions are the primary expanded content - Add single-page mode for small result sets (<=20 tests) that renders summary + test cases together without tab navigation - Add light/dark theme toggle with full CSS 
custom property support Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(results): address lint issues in report UX commit - Sort imports alphabetically in report.ts - Fix biome formatting (long line wrapping) - Replace non-null assertion with type assertion in test Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * style(results): apply biome formatting to report template Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * docs(results): update report screenshots and docs for simplified UX Replace screenshots with realistic data showing the new single-page layout, unified assertions, criteria column, and collapsible I/O. Update feature descriptions to match the current report template. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(results): fix chip vertical stretch and retake doc screenshots Add align-items: center to .header-meta so chips don't stretch to match the theme toggle button height. Retake details screenshot with All status filter so pass rate (83.3%) is consistent with the overview. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(results): show grader name on badges and add criteria fallback Assertion badges now prefer the assertion's own type (e.g. "contains", "regex") over the parent score name ("deterministic"). When input has no prompt, criteria falls back to concatenated grader names + first assertion text, truncated at 120 chars. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e07634d commit 770bd0f

8 files changed

Lines changed: 451 additions & 3 deletions

File tree

apps/cli/src/commands/results/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { subcommands } from 'cmd-ts';
22

33
import { resultsExportCommand } from './export.js';
44
import { resultsFailuresCommand } from './failures.js';
5+
import { resultsReportCommand } from './report.js';
56
import { resultsShowCommand } from './show.js';
67
import { resultsSummaryCommand } from './summary.js';
78
import { resultsValidateCommand } from './validate.js';
@@ -11,6 +12,7 @@ export const resultsCommand = subcommands({
1112
description: 'Inspect, export, and manage evaluation results',
1213
cmds: {
1314
export: resultsExportCommand,
15+
report: resultsReportCommand,
1416
summary: resultsSummaryCommand,
1517
failures: resultsFailuresCommand,
1618
show: resultsShowCommand,

apps/cli/src/commands/results/report-template.ts

Lines changed: 2 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
2+
import path from 'node:path';
3+
4+
import { command, option, optional, string } from 'cmd-ts';
5+
6+
import type { EvaluationResult } from '@agentv/core';
7+
8+
import { loadManifestResults, parseResultManifest, resolveResultSourcePath } from './manifest.js';
9+
import { RESULTS_REPORT_TEMPLATE } from './report-template.js';
10+
import { resolveSourceFile, sourceArg } from './shared.js';
11+
12+
/**
 * The part of a parsed result-manifest record that the report reads.
 * Every other field on the record is ignored here.
 */
interface ReportManifestRecord {
  /** Eval file recorded on the manifest entry, when it carries one. */
  readonly eval_file?: string;
}
15+
16+
/**
 * The slice of `benchmark.json` this module looks at: an optional
 * `metadata` object that may name the eval file.
 */
interface BenchmarkMetadata {
  readonly metadata?: { readonly eval_file?: string };
}
21+
22+
/**
 * Turns an eval file path into a short label: the base name with a trailing
 * `.results.jsonl`, `.eval.yaml`/`.eval.yml`, `.yaml`/`.yml` or `.jsonl`
 * suffix removed (matched case-insensitively).
 *
 * @param value - Raw eval file value; may be missing or padded with whitespace.
 * @returns The label, or `undefined` when `value` is missing, blank, or
 *   reduces to nothing once the directory and suffix are stripped.
 */
function normalizeEvalFileLabel(value: string | undefined): string | undefined {
  const trimmed = value?.trim();
  if (!trimmed) {
    return undefined;
  }

  const label = path
    .basename(trimmed)
    .replace(/\.results\.jsonl$/i, '')
    .replace(/\.eval\.ya?ml$/i, '')
    .replace(/\.ya?ml$/i, '')
    .replace(/\.jsonl$/i, '');

  // A name that is nothing but a suffix (".yaml", ".eval.yml", ...) strips
  // down to "". An empty string is not nullish, so returning it would stop a
  // caller's `??` fallback chain on a blank label; report it as missing.
  return label.trim() === '' ? undefined : label;
}
35+
36+
/**
 * Looks for `benchmark.json` next to `sourceFile` and returns the normalized
 * label of its `metadata.eval_file` entry.
 *
 * @param sourceFile - Path whose directory is searched for `benchmark.json`.
 * @returns The normalized label, or `undefined` when the file is absent,
 *   unreadable, not valid JSON, or carries no usable `eval_file` string.
 */
function readBenchmarkEvalFile(sourceFile: string): string | undefined {
  const runDir = path.dirname(sourceFile);
  const benchmarkPath = path.join(runDir, 'benchmark.json');
  if (!existsSync(benchmarkPath)) {
    return undefined;
  }

  try {
    const parsed: unknown = JSON.parse(readFileSync(benchmarkPath, 'utf8'));
    if (parsed === null || parsed === undefined) {
      return undefined;
    }

    const evalFile: unknown = (parsed as BenchmarkMetadata).metadata?.eval_file;
    // Only a string can be turned into a label; anything else counts as absent.
    return typeof evalFile === 'string' ? normalizeEvalFileLabel(evalFile) : undefined;
  } catch {
    // Best effort: a broken benchmark file must not block report generation.
    return undefined;
  }
}
49+
50+
/**
 * Default report location: `report.html` in the same directory as the
 * source file.
 *
 * @param sourceFile - Path of the file the report is generated from.
 * @returns Path of `report.html` beside `sourceFile`.
 */
export function deriveReportPath(sourceFile: string): string {
  const runDir = path.dirname(sourceFile);
  return path.join(runDir, 'report.html');
}
53+
54+
/**
 * Flattens one evaluation result into the snake_case row embedded in the
 * report, and attaches an `eval_file` label.
 *
 * The label is the first available of: the manifest record's eval file, the
 * benchmark eval file, the result's suite, and finally the name of the
 * directory containing `sourceFile`.
 *
 * @param result - Evaluation result to serialize.
 * @param sourceFile - Source file path, used only for the last-resort label.
 * @param manifestRecord - Manifest record paired with this result, if any.
 * @param benchmarkEvalFile - Label taken from `benchmark.json`, if any.
 * @returns A plain object ready for `JSON.stringify`.
 */
function serializeReportResult(
  result: EvaluationResult,
  sourceFile: string,
  manifestRecord?: ReportManifestRecord,
  benchmarkEvalFile?: string,
): Record<string, unknown> {
  const {
    timestamp,
    testId,
    suite,
    category,
    target,
    score,
    scores,
    executionStatus,
    error,
    durationMs,
    tokenUsage,
    costUsd,
    input,
    output,
    assertions,
  } = result;

  const evalFileLabel =
    normalizeEvalFileLabel(manifestRecord?.eval_file) ??
    benchmarkEvalFile ??
    normalizeEvalFileLabel(suite) ??
    path.basename(path.dirname(sourceFile));

  // Key order is kept stable so the serialized rows stay byte-for-byte comparable.
  return {
    timestamp,
    test_id: testId,
    suite,
    category,
    target,
    score,
    scores,
    execution_status: executionStatus,
    error,
    duration_ms: durationMs,
    token_usage: tokenUsage,
    cost_usd: costUsd,
    input,
    output,
    assertions,
    eval_file: evalFileLabel,
  };
}
85+
86+
/**
 * Resolves the report source and loads everything the report needs from it:
 * the evaluation results, the parsed manifest records, and the eval file
 * label from a neighbouring `benchmark.json` when one can be read.
 *
 * @param source - Source argument as given by the user; may be omitted.
 * @param cwd - Directory used when resolving `source`.
 * @returns The resolved source file together with its results, manifest
 *   records and optional benchmark label.
 * @throws Error when the source yields no results.
 */
export async function loadReportSource(
  source: string | undefined,
  cwd: string,
): Promise<{
  sourceFile: string;
  results: EvaluationResult[];
  records: readonly ReportManifestRecord[];
  benchmarkEvalFile?: string;
}> {
  const resolved = await resolveSourceFile(source, cwd);
  const manifestPath = resolveResultSourcePath(resolved.sourceFile, cwd);

  const manifestText = readFileSync(manifestPath, 'utf8');
  const records = parseResultManifest(manifestText) as ReportManifestRecord[];
  const results = loadManifestResults(manifestPath);

  if (results.length === 0) {
    throw new Error(`No results found in ${manifestPath}`);
  }

  const benchmarkEvalFile = readBenchmarkEvalFile(manifestPath);
  return { sourceFile: manifestPath, results, records, benchmarkEvalFile };
}
112+
113+
/**
 * Renders the static HTML report by serializing every result and injecting
 * the JSON into the template's `__DATA_PLACEHOLDER__` slot.
 *
 * @param results - Evaluation results to embed.
 * @param sourceFile - Source file path, forwarded for eval-file label fallback.
 * @param records - Manifest records, paired with `results` by index.
 * @param benchmarkEvalFile - Label taken from `benchmark.json`, if any.
 * @returns The complete HTML document.
 * @throws Error when the template does not contain the placeholder.
 */
export function renderResultsReport(
  results: readonly EvaluationResult[],
  sourceFile: string,
  records: readonly ReportManifestRecord[],
  benchmarkEvalFile?: string,
): string {
  if (!RESULTS_REPORT_TEMPLATE.includes('__DATA_PLACEHOLDER__')) {
    throw new Error('Report template is missing __DATA_PLACEHOLDER__');
  }

  const rows = results.map((result, index) =>
    serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile),
  );

  // Result text is arbitrary, so keep it from terminating or confusing the
  // inline <script>: "</" could close the element early, and "<!--" can put
  // the HTML parser into its escaped-script states. Both rewrites are plain
  // string escapes, so the parsed data is unchanged.
  const dataJson = JSON.stringify(rows)
    .replace(/<\//g, '<\\/')
    .replace(/<!--/g, '\\u003c!--');

  // Use a replacer function: with a string replacement, `$&`, `$$`, "$`" and
  // `$'` inside the data would be expanded as substitution patterns and
  // corrupt the embedded JSON.
  return RESULTS_REPORT_TEMPLATE.replace('__DATA_PLACEHOLDER__', () => dataJson);
}
129+
130+
/**
 * Loads the report source, renders the HTML report and writes it to disk,
 * creating the output directory when needed.
 *
 * @param source - Source argument as given by the user; may be omitted.
 * @param outputPath - Destination file. A relative path is resolved against
 *   `cwd`; when omitted or empty, `report.html` beside the source file is used.
 * @param cwd - Directory used for resolving `source` and `outputPath`.
 * @returns The resolved source file, the path written, and the HTML as read
 *   back from disk.
 * @throws Error when the written file still contains `__DATA_PLACEHOLDER__`.
 */
export async function writeResultsReport(
  source: string | undefined,
  outputPath: string | undefined,
  cwd: string,
): Promise<{ sourceFile: string; outputPath: string; html: string }> {
  const loaded = await loadReportSource(source, cwd);

  let resolvedOutputPath: string;
  if (!outputPath) {
    resolvedOutputPath = deriveReportPath(loaded.sourceFile);
  } else if (path.isAbsolute(outputPath)) {
    resolvedOutputPath = outputPath;
  } else {
    resolvedOutputPath = path.resolve(cwd, outputPath);
  }

  const html = renderResultsReport(
    loaded.results,
    loaded.sourceFile,
    loaded.records,
    loaded.benchmarkEvalFile,
  );

  mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
  writeFileSync(resolvedOutputPath, html, 'utf8');

  // Verify what actually landed on disk rather than the in-memory string.
  const written = readFileSync(resolvedOutputPath, 'utf8');
  if (written.includes('__DATA_PLACEHOLDER__')) {
    throw new Error('Report placeholder substitution failed');
  }

  return { sourceFile: loaded.sourceFile, outputPath: resolvedOutputPath, html: written };
}
153+
154+
/**
 * `report` subcommand: generates the static HTML report for a source and
 * prints where it was written. Any failure is reported on stderr and the
 * process exits with status 1.
 */
export const resultsReportCommand = command({
  name: 'report',
  description: 'Generate a static HTML report from a run workspace or index.jsonl manifest',
  args: {
    source: sourceArg,
    out: option({
      type: optional(string),
      long: 'out',
      short: 'o',
      description: 'Output HTML file (defaults to <run-dir>/report.html)',
    }),
    dir: option({
      type: optional(string),
      long: 'dir',
      short: 'd',
      description: 'Working directory (default: current directory)',
    }),
  },
  handler: async ({ source, out, dir }) => {
    const cwd = dir ?? process.cwd();

    try {
      const { sourceFile, outputPath } = await writeResultsReport(source, out, cwd);
      console.log(`Report written to ${outputPath}`);
      console.log(`Source: ${sourceFile}`);
    } catch (error) {
      // A thrown value is not guaranteed to be an Error; narrow it so a
      // non-Error throw is printed as itself instead of "Error: undefined".
      const message = error instanceof Error ? error.message : String(error);
      console.error(`Error: ${message}`);
      process.exit(1);
    }
  },
});
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
2+
import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
3+
import { tmpdir } from 'node:os';
4+
import path from 'node:path';
5+
import vm from 'node:vm';
6+
7+
import type { EvaluationResult, EvaluatorResult } from '@agentv/core';
8+
9+
import { writeArtifactsFromResults } from '../../../src/commands/eval/artifact-writer.js';
10+
import {
11+
deriveReportPath,
12+
loadReportSource,
13+
writeResultsReport,
14+
} from '../../../src/commands/results/report.js';
15+
16+
/**
 * Test helper: builds an evaluator result whose verdict follows the score —
 * `'pass'` at 0.5 or above, `'fail'` below.
 */
function makeScore(
  name: string,
  type: string,
  score: number,
  assertions: EvaluatorResult['assertions'],
): EvaluatorResult {
  const verdict = score >= 0.5 ? 'pass' : 'fail';
  return { name, type, score, assertions, verdict };
}
30+
31+
/**
 * Test helper: returns a passing evaluation result with fixed defaults.
 * Fields supplied in `overrides` replace the corresponding defaults.
 */
function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
  const defaults: EvaluationResult = {
    timestamp: '2026-04-15T01:00:00.000Z',
    testId: 'test-1',
    suite: 'default',
    score: 1,
    assertions: [{ text: 'fallback assertion', passed: true, evidence: 'ok' }],
    output: [{ role: 'assistant', content: 'answer' }],
    input: [{ role: 'user', content: 'question' }],
    target: 'default',
    executionStatus: 'ok',
    tokenUsage: { input: 100, output: 50 },
    durationMs: 1200,
  };

  return { ...defaults, ...overrides };
}
47+
48+
describe('results report', () => {
  // Scratch directory, created before and removed after every test.
  let tempDir: string;

  beforeEach(() => {
    tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-report-test-'));
  });

  afterEach(() => {
    rmSync(tempDir, { recursive: true, force: true });
  });

  it('derives default report path from the run workspace', () => {
    const runDir = path.join(tempDir, 'run');
    const expected = path.join(runDir, 'report.html');

    expect(deriveReportPath(path.join(runDir, 'index.jsonl'))).toBe(expected);
  });

  it('loads benchmark eval file metadata from a run workspace', async () => {
    const runDir = path.join(tempDir, 'run');
    await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' });

    const { results, benchmarkEvalFile } = await loadReportSource(runDir, tempDir);

    expect(results).toHaveLength(1);
    expect(benchmarkEvalFile).toBe('demo');
  });

  it('writes a static HTML report with grouped eval files and assertion type badges', async () => {
    const runDir = path.join(tempDir, 'run');

    const passing = makeResult({
      testId: 'registry-pass',
      target: 'claude-sonnet',
      scores: [
        makeScore('contains', 'contains', 1, [
          { text: 'mentions registry', passed: true, evidence: 'registry present' },
        ]),
      ],
    });
    const failing = makeResult({
      testId: 'billing-fail',
      target: 'gpt-5.4',
      score: 0.2,
      executionStatus: 'quality_failure',
      scores: [
        makeScore('regex', 'regex', 0.2, [
          { text: 'matches invoice pattern', passed: false, evidence: 'no invoice id' },
        ]),
      ],
    });
    await writeArtifactsFromResults([passing, failing], runDir, {
      evalFile: 'evals/demo.eval.yaml',
    });

    // Give each manifest record its own eval_file so the report has to group
    // by the per-record label.
    const indexPath = path.join(runDir, 'index.jsonl');
    const manifestRecords = readFileSync(indexPath, 'utf8')
      .trim()
      .split('\n')
      .map((line) => JSON.parse(line) as Record<string, unknown>);
    manifestRecords[0].eval_file = 'cw-freight-boolean-registry';
    manifestRecords[1].eval_file = 'cw-freight-billing';
    const rewritten = manifestRecords.map((record) => JSON.stringify(record)).join('\n');
    writeFileSync(indexPath, `${rewritten}\n`, 'utf8');

    const { outputPath } = await writeResultsReport(runDir, undefined, tempDir);
    const html = readFileSync(outputPath, 'utf8');

    expect(outputPath).toBe(path.join(runDir, 'report.html'));

    const expectedFragments = [
      '#030712',
      'cw-freight-boolean-registry',
      'cw-freight-billing',
      'contains',
      'regex',
      'AgentV Evaluation Report',
      '<span class="pass-rate-track">',
      '<span class="pass-rate-label">${formatPercent(rate)}</span>',
      '<span class="metric-value">${escapeHtml(formatPercent(group.stats.pass_rate))}</span>',
      'Assertions',
      'assertion-badge',
    ];
    for (const fragment of expectedFragments) {
      expect(html).toContain(fragment);
    }

    const forbiddenFragments = [
      '__DATA_PLACEHOLDER__',
      '<th>Progress</th>',
      'metric-stack',
      'Grader Results',
      'Evaluator Results',
    ];
    for (const fragment of forbiddenFragments) {
      expect(html).not.toContain(fragment);
    }
  });

  it('emits an inline report script that parses successfully', async () => {
    const runDir = path.join(tempDir, 'run');
    await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' });

    const { outputPath } = await writeResultsReport(runDir, undefined, tempDir);
    const html = readFileSync(outputPath, 'utf8');
    const scriptMatch = /<script>([\s\S]*)<\/script>/.exec(html);
    const script = scriptMatch?.[1];

    expect(script).toBeString();

    // Minimal DOM stand-ins: only the members these stubs provide are available
    // to the script when it runs in the sandbox.
    const noop = () => undefined;
    const app = { innerHTML: '' };
    const headerMeta = { innerHTML: '' };
    const tabNav = { classList: { add: noop, remove: noop } };
    const tabButton = {
      getAttribute: () => 'overview',
      classList: { toggle: noop },
      addEventListener: noop,
    };

    const fakeDocument = {
      documentElement: { classList: { contains: () => false, toggle: noop } },
      getElementById(id: string) {
        switch (id) {
          case 'app':
            return app;
          case 'header-meta':
            return headerMeta;
          case 'tab-nav':
            return tabNav;
          case 'theme-btn':
            return { addEventListener: noop };
          default:
            return null;
        }
      },
      querySelectorAll(selector: string) {
        return selector === '.tab' ? [tabButton] : [];
      },
    };

    const runScript = () =>
      vm.runInNewContext(script as string, { console, document: fakeDocument });

    expect(runScript).not.toThrow();
  });
});
95.9 KB
Loading
91.2 KB
Loading

0 commit comments

Comments
 (0)