Skip to content

Commit fa83b2c

Browse files
committed
feat(cli): rerun captured task bundles
1 parent 35263cd commit fa83b2c

7 files changed

Lines changed: 1025 additions & 28 deletions

File tree

apps/cli/src/commands/eval/run-eval.ts

Lines changed: 80 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ interface NormalizedOptions {
126126
readonly transcript?: string;
127127
readonly experiment?: string;
128128
readonly budgetUsd?: number;
129+
readonly sourceMetadataByEvalFile?: ReadonlyMap<string, Record<string, unknown>>;
129130
}
130131

131132
function normalizeBoolean(value: unknown): boolean {
@@ -197,6 +198,35 @@ function normalizeFilter(value: unknown): string | readonly string[] | undefined
197198
return normalizeString(value);
198199
}
199200

201+
/**
 * Type guard for a single metadata payload: any non-null object that is not an array.
 */
function isSourceMetadataRecord(candidate: unknown): candidate is Record<string, unknown> {
  return typeof candidate === 'object' && candidate !== null && !Array.isArray(candidate);
}

/**
 * Normalize the raw `sourceMetadataByEvalFile` option into a map keyed by
 * resolved (absolute) eval-file path.
 *
 * Accepts either a `Map` or a plain object whose values are metadata objects.
 * Entries whose key is not a string, or whose value is not a non-null,
 * non-array object, are dropped. Every surviving key is passed through
 * `path.resolve`, so lookups must resolve their path the same way.
 *
 * @param value - Untrusted raw option value.
 * @returns A map of resolved path to metadata, or `undefined` when the input
 *   is neither a `Map` nor a plain object, or when no entry survives filtering.
 */
function normalizeSourceMetadataByEvalFile(
  value: unknown,
): ReadonlyMap<string, Record<string, unknown>> | undefined {
  // Reduce both accepted input shapes to one list of [key, value] pairs so the
  // validation below exists in exactly one place.
  let rawEntries: readonly (readonly [unknown, unknown])[];
  if (value instanceof Map) {
    rawEntries = [...value.entries()];
  } else if (isSourceMetadataRecord(value)) {
    rawEntries = Object.entries(value);
  } else {
    return undefined;
  }

  const normalized = new Map<string, Record<string, unknown>>();
  for (const [key, metadata] of rawEntries) {
    if (typeof key === 'string' && isSourceMetadataRecord(metadata)) {
      // When two keys resolve to the same absolute path, the later entry wins.
      normalized.set(path.resolve(key), metadata);
    }
  }

  return normalized.size > 0 ? normalized : undefined;
}
229+
200230
/**
201231
* Check whether an eval file's tags satisfy --tag / --exclude-tag filters.
202232
*
@@ -404,9 +434,30 @@ function normalizeOptions(
404434
transcript: normalizeString(rawOptions.transcript),
405435
experiment: normalizeString(rawOptions.experiment),
406436
budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd),
437+
sourceMetadataByEvalFile: normalizeSourceMetadataByEvalFile(
438+
rawOptions.sourceMetadataByEvalFile,
439+
),
407440
} satisfies NormalizedOptions;
408441
}
409442

443+
/**
 * Attach per-eval-file source metadata to an evaluation result.
 *
 * The metadata is looked up in `options.sourceMetadataByEvalFile` under the
 * resolved form of `testFilePath`. When an entry exists, a new result object
 * is returned whose `metadata` is the result's own metadata overlaid with the
 * source metadata (source keys win on conflict). When there is no map or no
 * entry, the original `result` object is returned as-is.
 *
 * @param result - Result to decorate; it is not mutated.
 * @param testFilePath - Path of the eval file that produced the result.
 * @param options - Normalized options carrying the optional metadata map.
 * @returns The decorated copy, or `result` itself when nothing applies.
 */
function withSourceMetadata(
  result: EvaluationResult,
  testFilePath: string,
  options: NormalizedOptions,
): EvaluationResult {
  const fileMetadata = options.sourceMetadataByEvalFile?.get(path.resolve(testFilePath));
  return fileMetadata
    ? { ...result, metadata: { ...result.metadata, ...fileMetadata } }
    : result;
}
460+
410461
async function ensureFileExists(filePath: string, description: string): Promise<void> {
411462
try {
412463
await access(filePath, constants.F_OK);
@@ -919,9 +970,10 @@ async function runSingleEvalFile(params: {
919970
// Trim output messages for results JSONL based on --output-messages.
920971
// Each message is trimmed to { role, content } only (no toolCalls, startTime, etc.).
921972
// Full output with tool calls goes to OTel.
922-
const trimmedOutput = trimOutputMessages(result.output, options.outputMessages);
973+
const resultWithMetadata = withSourceMetadata(result, testFilePath, options);
974+
const trimmedOutput = trimOutputMessages(resultWithMetadata.output, options.outputMessages);
923975
const trimmedResult: EvaluationResult = {
924-
...result,
976+
...resultWithMetadata,
925977
output: trimmedOutput,
926978
};
927979
await outputWriter.append(trimmedResult);
@@ -976,7 +1028,7 @@ async function runSingleEvalFile(params: {
9761028
},
9771029
});
9781030

979-
return { results: [...results] };
1031+
return { results: results.map((result) => withSourceMetadata(result, testFilePath, options)) };
9801032
}
9811033

9821034
export interface RunEvalResult {
@@ -1529,9 +1581,11 @@ export async function runEvalCommand(
15291581
target: selection.targetName,
15301582
}));
15311583
for (const r of skippedResults) {
1532-
await outputWriter.append(r);
1584+
await outputWriter.append(withSourceMetadata(r, testFilePath, options));
15331585
}
1534-
allResults.push(...skippedResults);
1586+
allResults.push(
1587+
...skippedResults.map((r) => withSourceMetadata(r, testFilePath, options)),
1588+
);
15351589
}
15361590
continue;
15371591
}
@@ -1614,21 +1668,27 @@ export async function runEvalCommand(
16141668
console.error(
16151669
`\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)}${message}\n`,
16161670
);
1617-
const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => ({
1618-
timestamp: new Date().toISOString(),
1619-
testId: testCase.id,
1620-
score: 0,
1621-
assertions: [],
1622-
output: [],
1623-
scores: [],
1624-
error: message,
1625-
executionStatus: 'execution_error' as const,
1626-
failureStage: 'setup' as const,
1627-
failureReasonCode: 'setup_error' as const,
1628-
durationMs: 0,
1629-
tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 },
1630-
target: selection.targetName,
1631-
}));
1671+
const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) =>
1672+
withSourceMetadata(
1673+
{
1674+
timestamp: new Date().toISOString(),
1675+
testId: testCase.id,
1676+
score: 0,
1677+
assertions: [],
1678+
output: [],
1679+
scores: [],
1680+
error: message,
1681+
executionStatus: 'execution_error' as const,
1682+
failureStage: 'setup' as const,
1683+
failureReasonCode: 'setup_error' as const,
1684+
durationMs: 0,
1685+
tokenUsage: { input: 0, output: 0 },
1686+
target: selection.targetName,
1687+
},
1688+
testFilePath,
1689+
options,
1690+
),
1691+
);
16321692
for (const errResult of errorResults) {
16331693
await outputWriter.append(errResult);
16341694
}

apps/cli/src/commands/results/manifest.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,13 @@ export interface ResultManifestRecord {
3333
readonly input_path?: string;
3434
readonly output_path?: string;
3535
readonly response_path?: string;
36+
readonly artifact_dir?: string;
3637
readonly task_dir?: string;
3738
readonly eval_path?: string;
3839
readonly targets_path?: string;
3940
readonly files_path?: string;
4041
readonly graders_path?: string;
42+
readonly metadata?: Record<string, unknown>;
4143
}
4244

4345
function parseJsonlLines<T>(content: string): T[] {
@@ -175,6 +177,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
175177
costUsd: record.cost_usd,
176178
input: hydrateInput(baseDir, record),
177179
output: hydrateOutput(baseDir, record),
180+
metadata: record.metadata,
178181
} as EvaluationResult;
179182
}
180183

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import { subcommands } from 'cmd-ts';
2+
3+
import { runsRerunCommand } from './rerun.js';
4+
5+
/** Subcommands exposed under the `runs` command group. */
const runsSubcommands = {
  rerun: runsRerunCommand,
};

/**
 * `runs` command group: operations on captured run workspaces.
 * Currently registers a single subcommand, `rerun`.
 */
export const runsCommand = subcommands({
  name: 'runs',
  description: 'Operate on captured run workspaces',
  cmds: runsSubcommands,
});

0 commit comments

Comments
 (0)