Skip to content

Commit e700c35

Browse files
authored
feat(scripts): deploy example eval results repos (#1267)
* feat(scripts): deploy example eval results repos
* fix(studio): clarify synced remote runs
* fix(studio): dedupe synced runs in detail views
* fix(results): store remote runs in agentv results layout
1 parent f7a3b16 commit e700c35

13 files changed

Lines changed: 721 additions & 27 deletions

File tree

apps/cli/src/commands/results/remote.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,10 @@ export async function ensureRemoteRunAvailable(
290290
throw new Error(`Remote manifest path is outside the results repo clone: ${meta.path}`);
291291
}
292292

293-
const relativeRunPath = path.posix.relative('runs', path.posix.dirname(relativeManifestPath));
293+
const relativeRunPath = path.posix.relative(
294+
'.agentv/results/runs',
295+
path.posix.dirname(relativeManifestPath),
296+
);
294297
await materializeGitRun(config.path, relativeRunPath);
295298
}
296299

apps/cli/src/commands/results/serve.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,12 +376,14 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
376376
let target: string | undefined;
377377
let experiment = inferExperimentFromRunId(m.raw_filename);
378378
let passRate = m.passRate;
379+
let avgScore = m.avgScore;
379380
try {
380381
const records = await loadLightweightResultsForMeta(searchDir, m);
381382
if (records.length > 0) {
382383
target = records[0].target;
383384
experiment = records[0].experiment ?? experiment;
384385
passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
386+
avgScore = records.reduce((sum, r) => sum + r.score, 0) / records.length;
385387
} else {
386388
// Run is in-progress with 0 results written yet — fall back to the
387389
// in-memory target stored when the Studio launched this run.
@@ -402,7 +404,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
402404
timestamp: m.timestamp,
403405
test_count: m.testCount,
404406
pass_rate: passRate,
405-
avg_score: m.avgScore,
407+
avg_score: avgScore,
406408
size_bytes: m.sizeBytes,
407409
source: m.source,
408410
...(target && { target }),

apps/cli/test/commands/results/serve.test.ts

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ function writeRemoteRunArtifact(
107107
/^(\d{4}-\d{2}-\d{2})T(\d{2})-(\d{2})-(\d{2})-(\d{3})Z$/,
108108
'$1T$2:$3:$4.$5Z',
109109
);
110-
const runDir = path.join(cloneDir, 'runs', experiment, timestamp);
110+
const runDir = path.join(cloneDir, '.agentv', 'results', 'runs', experiment, timestamp);
111111
mkdirSync(runDir, { recursive: true });
112112
writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultRecord));
113113
writeFileSync(
@@ -653,6 +653,8 @@ describe('serve app', () => {
653653
process.env.AGENTV_HOME,
654654
'results',
655655
'EntityProcess-agentv-evals',
656+
'.agentv',
657+
'results',
656658
'runs',
657659
'default',
658660
'2026-03-26T10-00-00-000Z',
@@ -705,14 +707,21 @@ describe('serve app', () => {
705707
const listRes = await app.request('/api/runs');
706708
expect(listRes.status).toBe(200);
707709
const listData = (await listRes.json()) as {
708-
runs: Array<{ filename: string; source: string; experiment?: string; pass_rate?: number }>;
710+
runs: Array<{
711+
filename: string;
712+
source: string;
713+
experiment?: string;
714+
pass_rate?: number;
715+
avg_score?: number;
716+
}>;
709717
};
710718
expect(listData.runs).toHaveLength(1);
711719
expect(listData.runs[0]).toMatchObject({
712720
filename: `remote::${runId}`,
713721
source: 'remote',
714722
experiment: 'green-uat',
715723
pass_rate: 1,
724+
avg_score: 1,
716725
});
717726

718727
const detailRes = await app.request(
@@ -749,6 +758,8 @@ describe('serve app', () => {
749758

750759
const runManifestPath = path.join(
751760
cloneDir,
761+
'.agentv',
762+
'results',
752763
'runs',
753764
'external-sync',
754765
'2026-03-26T11-00-00-000Z',

apps/studio/src/components/ExperimentDetail.tsx

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
projectRunListOptions,
1414
runListOptions,
1515
} from '~/lib/api';
16+
import { dedupeSyncedRuns } from '~/lib/run-dedupe';
1617

1718
import { RunList } from './RunList';
1819

@@ -45,12 +46,12 @@ export function ExperimentDetail({ experimentName, projectId }: ExperimentDetail
4546
}
4647

4748
const experiment = experimentsData?.experiments?.find((entry) => entry.name === experimentName);
48-
const runs = (runListData?.runs ?? []).filter(
49-
(run) => (run.experiment ?? 'default') === experimentName,
49+
const runs = dedupeSyncedRuns(
50+
(runListData?.runs ?? []).filter((run) => (run.experiment ?? 'default') === experimentName),
5051
);
5152

5253
const passRate = experiment?.pass_rate ?? 0;
53-
const runCount = experiment?.run_count ?? runs.length;
54+
const runCount = runs.length;
5455
const targetCount = experiment?.target_count ?? 0;
5556

5657
return (

apps/studio/src/components/TargetsTab.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import {
1515
runListOptions,
1616
targetsOptions,
1717
} from '~/lib/api';
18+
import { dedupeSyncedRuns } from '~/lib/run-dedupe';
1819
import type { RunMeta, TargetsResponse } from '~/lib/types';
1920

2021
import { PassRatePill } from './PassRatePill';
@@ -68,7 +69,7 @@ export function TargetsTab({ projectId }: TargetsTabProps = {}) {
6869
}
6970

7071
return [...groups.entries()]
71-
.map(([name, experimentRuns]) => buildExperimentGroup(name, experimentRuns))
72+
.map(([name, experimentRuns]) => buildExperimentGroup(name, dedupeSyncedRuns(experimentRuns)))
7273
.sort((a, b) => {
7374
if (a.latestTimestamp && b.latestTimestamp && a.latestTimestamp !== b.latestTimestamp) {
7475
return b.latestTimestamp.localeCompare(a.latestTimestamp);
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import { describe, expect, it } from 'bun:test';
2+
3+
import { dedupeSyncedRuns } from './run-dedupe';
4+
import type { RunMeta } from './types';
5+
6+
function run(filename: string, source: RunMeta['source']): RunMeta {
7+
return {
8+
filename,
9+
display_name: filename,
10+
path: `/tmp/${filename}`,
11+
timestamp: '2026-05-28T08:21:09.063Z',
12+
test_count: 8,
13+
pass_rate: 1,
14+
avg_score: 1,
15+
size_bytes: 1024,
16+
source,
17+
};
18+
}
19+
// Exercises dedupeSyncedRuns with a run that is present both as a remote
// entry (filename prefixed with `remote::`) and as a local entry.
describe('dedupeSyncedRuns', () => {
  it('collapses local and remote copies of the same run in all-runs views', () => {
    // The first two fixtures share a run id once the `remote::` prefix is
    // removed; the third fixture has no local counterpart.
    const runs = [
      run('remote::2026-05-28T08-21-09-063Z', 'remote'),
      run('2026-05-28T08-21-09-063Z', 'local'),
      run('remote::2026-05-27T08-21-09-063Z', 'remote'),
    ];

    // The local copy is expected to survive for the duplicated run, and the
    // unmatched remote run is expected to be kept as-is.
    expect(dedupeSyncedRuns(runs).map((r) => r.filename)).toEqual([
      '2026-05-28T08-21-09-063Z',
      'remote::2026-05-27T08-21-09-063Z',
    ]);
  });
});

apps/studio/src/lib/run-dedupe.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import type { RunMeta } from './types';
2+
/** Filename prefix that distinguishes a remote run entry from its local copy. */
const REMOTE_RUN_PREFIX = 'remote::';

/**
 * Returns the run id used to match local and remote copies of a run.
 *
 * @param filename - A run's `filename` field.
 * @returns `filename` with one leading `remote::` prefix removed, or
 *   `filename` unchanged when it does not start with that prefix.
 */
function canonicalRunId(filename: string): string {
  return filename.startsWith(REMOTE_RUN_PREFIX)
    ? filename.slice(REMOTE_RUN_PREFIX.length)
    : filename;
}

/**
 * Collapses run entries that refer to the same run.
 *
 * Entries are grouped by their filename with any `remote::` prefix stripped.
 * Within a group the first entry seen is kept, except that an entry whose
 * `source` is `'local'` replaces a previously stored entry whose `source` is
 * `'remote'`.
 *
 * @param runs - Run entries to dedupe; the input array is not mutated.
 * @returns A new array with one entry per run id, ordered by the position at
 *   which each run id first appeared in `runs`.
 */
export function dedupeSyncedRuns(runs: readonly RunMeta[]): RunMeta[] {
  const byRunId = new Map<string, RunMeta>();

  for (const run of runs) {
    const key = canonicalRunId(run.filename);
    const existing = byRunId.get(key);
    // Store the first entry for a run id; afterwards only a local entry may
    // displace a stored remote entry.
    if (!existing || (existing.source === 'remote' && run.source === 'local')) {
      byRunId.set(key, run);
    }
  }

  // Re-setting an existing Map key keeps its original insertion position, so
  // a replacing local entry appears where the run id was first seen.
  return [...byRunId.values()];
}

apps/studio/src/routes/index.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import {
3333
resolveIndexRoute,
3434
resolveInitialProjectRedirect,
3535
} from '~/lib/navigation';
36+
import { dedupeSyncedRuns } from '~/lib/run-dedupe';
3637
import type { RunMeta } from '~/lib/types';
3738
type TabId = StudioTabId;
3839

@@ -234,7 +235,7 @@ function SingleProjectHome() {
234235
const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'experiments';
235236
const filteredRuns =
236237
sourceFilter === 'all'
237-
? (data?.runs ?? [])
238+
? dedupeSyncedRuns(data?.runs ?? [])
238239
: (data?.runs ?? []).filter((run) => run.source === sourceFilter);
239240

240241
async function handleSyncRemote() {

apps/studio/src/routes/projects/$projectId.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import {
2222
useRemoteStatus,
2323
useStudioConfig,
2424
} from '~/lib/api';
25+
import { dedupeSyncedRuns } from '~/lib/run-dedupe';
2526

2627
type TabId = 'runs' | 'experiments' | 'analytics' | 'targets';
2728

@@ -121,7 +122,7 @@ function ProjectRunsTab({ projectId }: { projectId: string }) {
121122

122123
const filteredRuns =
123124
sourceFilter === 'all'
124-
? (data?.runs ?? [])
125+
? dedupeSyncedRuns(data?.runs ?? [])
125126
: (data?.runs ?? []).filter((run) => run.source === sourceFilter);
126127

127128
async function handleSyncRemote() {

packages/core/src/evaluation/results-repo.ts

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ import { getAgentvHome } from '../paths.js';
1717
import type { ResultsConfig } from './loaders/config-loader.js';
1818

1919
const execFileAsync = promisify(execFile);
20+
const RESULTS_REPO_RESULTS_DIR = '.agentv/results';
21+
const RESULTS_REPO_RUNS_DIR = `${RESULTS_REPO_RESULTS_DIR}/runs`;
2022

2123
export interface ResultsRepoLocalPaths {
2224
readonly rootDir: string;
@@ -345,7 +347,7 @@ export async function stageResultsArtifacts(params: {
345347

346348
export function resolveResultsRepoRunsDir(config: ResultsConfig): string {
347349
const normalized = normalizeResultsConfig(config);
348-
return path.join(normalized.path, 'runs');
350+
return path.join(normalized.path, RESULTS_REPO_RESULTS_DIR, 'runs');
349351
}
350352

351353
export async function directorySizeBytes(targetPath: string): Promise<number> {
@@ -443,7 +445,12 @@ export async function directPushResults(params: {
443445
const baseBranch = await resolveDefaultBranch(repoDir);
444446
await fetchResultsRepo(repoDir);
445447

446-
const destinationDir = path.join(repoDir, 'runs', params.destinationPath);
448+
const destinationDir = path.join(
449+
repoDir,
450+
RESULTS_REPO_RESULTS_DIR,
451+
'runs',
452+
params.destinationPath,
453+
);
447454
await stageResultsArtifacts({
448455
repoDir,
449456
sourceDir: params.sourceDir,
@@ -655,9 +662,12 @@ function parseGitBatchBlobs(output: Buffer): GitBatchBlob[] {
655662
}
656663

657664
export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise<GitListedRun[]> {
658-
const { stdout: treeOut } = await runGit(['ls-tree', '-r', '--name-only', ref, 'runs'], {
659-
cwd: repoDir,
660-
});
665+
const { stdout: treeOut } = await runGit(
666+
['ls-tree', '-r', '--name-only', ref, RESULTS_REPO_RUNS_DIR],
667+
{
668+
cwd: repoDir,
669+
},
670+
);
661671

662672
const benchmarkPaths = treeOut
663673
.split(/\r?\n/)
@@ -679,7 +689,7 @@ export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise
679689
const benchmarkPath = benchmarkPaths[index];
680690
const benchmark = JSON.parse(blob.content.toString('utf8')) as GitRunBenchmark;
681691
const runDir = path.posix.dirname(benchmarkPath);
682-
const relativeRunPath = path.posix.relative('runs', runDir);
692+
const relativeRunPath = path.posix.relative(RESULTS_REPO_RUNS_DIR, runDir);
683693
const runId = buildGitRunId(relativeRunPath);
684694
const timestamp = benchmark.metadata?.timestamp?.trim() || path.posix.basename(runDir);
685695
const targets = benchmark.metadata?.targets ?? [];
@@ -712,7 +722,7 @@ export async function materializeGitRun(
712722
ref = 'origin/main',
713723
): Promise<void> {
714724
const normalizedRunPath = relativeRunPath.split(path.sep).join('/');
715-
const runTreePath = path.posix.join('runs', normalizedRunPath);
725+
const runTreePath = path.posix.join(RESULTS_REPO_RUNS_DIR, normalizedRunPath);
716726
const targetRunDir = path.join(repoDir, ...runTreePath.split('/'));
717727
const { stdout: treeOut } = await runGit(['ls-tree', '-r', '--name-only', ref, runTreePath], {
718728
cwd: repoDir,

0 commit comments

Comments
 (0)