Skip to content

Commit 4d0defc

Browse files
authored
feat(evaluation): final-answer output with trace artifacts (#1364)
* feat(trace): add canonical evaluation trace model
* feat(evaluation): score final output with full trace
* feat(cli): write answer and transcript artifacts
* chore: remove repo-local ntm artifacts
* chore(targets): remove duplicate pi sdk openai target
* fix(evaluation): stabilize final output trace contract
* fix(evaluation): pass final output to prompt templates
1 parent f0259a2 commit 4d0defc

44 files changed

Lines changed: 1364 additions & 535 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.agentv/targets.yaml

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -93,15 +93,6 @@ targets:
9393
thinking: low
9494
stream_log: raw
9595

96-
- name: pi-sdk-openai
97-
provider: pi-coding-agent
98-
subprovider: openai
99-
base_url: ${{ OPENAI_ENDPOINT }}
100-
api_key: ${{ OPENAI_API_KEY }}
101-
model: gpt-5.5
102-
grader_target: openai
103-
thinking: low
104-
stream_log: raw
10596

10697
- name: pi-azure
10798
provider: pi-cli

.ntm/palette.md

Lines changed: 0 additions & 17 deletions
This file was deleted.

.ntm/personas.toml

Lines changed: 0 additions & 8 deletions
This file was deleted.

apps/cli/src/commands/eval/artifact-writer.ts

Lines changed: 100 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,12 @@ import {
66
type EvalTest,
77
type EvaluationResult,
88
type GraderResult,
9+
type Message,
910
type TargetDefinition,
10-
toTranscriptJsonLines,
11+
type TraceSummary,
12+
buildTraceFromMessages,
13+
extractLastAssistantContent,
14+
traceToTranscriptJsonLines,
1115
} from '@agentv/core';
1216
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
1317
import { RESULT_INDEX_FILENAME } from './result-layout.js';
@@ -195,7 +199,10 @@ export interface IndexArtifactEntry {
195199
readonly grading_path: string;
196200
readonly timing_path: string;
197201
readonly output_path?: string;
202+
readonly answer_path?: string;
203+
readonly transcript_path?: string;
198204
readonly input_path?: string;
205+
/** @deprecated Use output_path/answer_path for the final answer. */
199206
readonly response_path?: string;
200207
readonly task_dir?: string;
201208
readonly eval_path?: string;
@@ -245,23 +252,8 @@ function countToolCalls(result: EvaluationResult): {
245252
toolCalls: Record<string, number>;
246253
total: number;
247254
} {
248-
const toolCalls: Record<string, number> = {};
249-
let total = 0;
250-
251-
const trace = result.trace as
252-
| { steps?: readonly { toolName?: string; type?: string }[] }
253-
| undefined;
254-
255-
if (trace?.steps) {
256-
for (const step of trace.steps) {
257-
if (step.toolName || step.type === 'tool') {
258-
const name = step.toolName ?? 'unknown';
259-
toolCalls[name] = (toolCalls[name] ?? 0) + 1;
260-
total += 1;
261-
}
262-
}
263-
}
264-
255+
const toolCalls = { ...(result.trace?.toolCalls ?? {}) };
256+
const total = Object.values(toolCalls).reduce((sum, count) => sum + count, 0);
265257
return { toolCalls, total };
266258
}
267259

@@ -365,9 +357,8 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact
365357
workspace_changes: parseWorkspaceChanges(result.fileChanges),
366358
conversation: result.conversationId
367359
? {
368-
turns: result.trace
369-
? ((result.trace as { steps?: readonly unknown[] }).steps?.length ?? 0)
370-
: 0,
360+
turns:
361+
result.trace?.messages.filter((message) => message.role === 'assistant').length ?? 0,
371362
conversation_id: result.conversationId,
372363
}
373364
: undefined,
@@ -661,7 +652,10 @@ export function buildIndexArtifactEntry(
661652
gradingPath: string;
662653
timingPath: string;
663654
outputPath?: string;
655+
answerPath?: string;
656+
transcriptPath?: string;
664657
inputPath?: string;
658+
responsePath?: string;
665659
taskBundle?: MaterializedTaskBundlePaths;
666660
},
667661
): IndexArtifactEntry {
@@ -689,9 +683,18 @@ export function buildIndexArtifactEntry(
689683
output_path: options.outputPath
690684
? toRelativeArtifactPath(options.outputDir, options.outputPath)
691685
: undefined,
686+
answer_path: options.answerPath
687+
? toRelativeArtifactPath(options.outputDir, options.answerPath)
688+
: undefined,
689+
transcript_path: options.transcriptPath
690+
? toRelativeArtifactPath(options.outputDir, options.transcriptPath)
691+
: undefined,
692692
input_path: options.inputPath
693693
? toRelativeArtifactPath(options.outputDir, options.inputPath)
694694
: undefined,
695+
response_path: options.responsePath
696+
? toRelativeArtifactPath(options.outputDir, options.responsePath)
697+
: undefined,
695698
...buildTaskBundleIndexFields(options.outputDir, options.taskBundle),
696699
metadata: result.metadata,
697700
};
@@ -703,7 +706,8 @@ export function buildResultIndexArtifact(
703706
): ResultIndexArtifact {
704707
const artifactSubdir = buildArtifactSubdir(result);
705708
const input = extractInput(result);
706-
const hasResponse = Array.isArray(result.output) && result.output.length > 0;
709+
const hasAnswer = result.output.length > 0;
710+
const hasTranscript = result.trace.messages.length > 0 || result.trace.events.length > 0;
707711

708712
return {
709713
timestamp: result.timestamp,
@@ -725,10 +729,12 @@ export function buildResultIndexArtifact(
725729
grading_path: path.posix.join(artifactSubdir, 'grading.json'),
726730
timing_path: path.posix.join(artifactSubdir, 'timing.json'),
727731
input_path: input ? path.posix.join(artifactSubdir, 'input.md') : undefined,
728-
output_path: hasResponse
729-
? path.posix.join(artifactSubdir, 'outputs', 'response.md')
732+
output_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined,
733+
answer_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined,
734+
transcript_path: hasTranscript
735+
? path.posix.join(artifactSubdir, 'outputs', 'transcript.jsonl')
730736
: undefined,
731-
response_path: hasResponse
737+
response_path: hasAnswer
732738
? path.posix.join(artifactSubdir, 'outputs', 'response.md')
733739
: undefined,
734740
...(taskBundle
@@ -756,6 +762,16 @@ async function writeJsonlFile(filePath: string, records: readonly unknown[]): Pr
756762
await writeFile(filePath, content, 'utf8');
757763
}
758764

765+
async function writeTranscriptJsonl(filePath: string, result: EvaluationResult): Promise<void> {
766+
const lines = traceToTranscriptJsonLines(result.trace, {
767+
testId: result.testId,
768+
target: result.target,
769+
});
770+
const content =
771+
lines.length > 0 ? `${lines.map((line) => JSON.stringify(line)).join('\n')}\n` : '';
772+
await writeFile(filePath, content, 'utf8');
773+
}
774+
759775
function isRecord(value: unknown): value is Record<string, unknown> {
760776
return typeof value === 'object' && value !== null && !Array.isArray(value);
761777
}
@@ -852,6 +868,7 @@ type ParsedEvaluationResult = Record<string, unknown> & {
852868
assertions: EvaluationResult['assertions'];
853869
target: string;
854870
output: EvaluationResult['output'];
871+
trace: EvaluationResult['trace'];
855872
executionStatus: EvaluationResult['executionStatus'];
856873
};
857874

@@ -874,7 +891,7 @@ function isAssertionEntry(value: unknown): value is EvaluationResult['assertions
874891
);
875892
}
876893

877-
function isOutputMessage(value: unknown): value is EvaluationResult['output'][number] {
894+
function isOutputMessage(value: unknown): value is Message {
878895
if (!value || typeof value !== 'object' || Array.isArray(value)) {
879896
return false;
880897
}
@@ -890,20 +907,56 @@ function isExecutionStatus(value: unknown): value is EvaluationResult['execution
890907
);
891908
}
892909

910+
function isTraceRecord(value: unknown): value is EvaluationResult['trace'] {
911+
return (
912+
!!value &&
913+
typeof value === 'object' &&
914+
!Array.isArray(value) &&
915+
Array.isArray((value as { messages?: unknown }).messages) &&
916+
Array.isArray((value as { events?: unknown }).events)
917+
);
918+
}
919+
893920
function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefined {
894921
if (!value || typeof value !== 'object' || Array.isArray(value)) {
895922
return undefined;
896923
}
897924

898925
const result = value as Record<string, unknown>;
926+
const legacyOutputMessages = Array.isArray(result.output)
927+
? result.output.filter(isOutputMessage)
928+
: undefined;
929+
const output =
930+
typeof result.output === 'string'
931+
? result.output
932+
: extractLastAssistantContent(legacyOutputMessages);
933+
const legacySummary =
934+
result.trace && typeof result.trace === 'object' && !Array.isArray(result.trace)
935+
? (result.trace as TraceSummary)
936+
: undefined;
937+
const trace = isTraceRecord(result.trace)
938+
? result.trace
939+
: buildTraceFromMessages({
940+
input: Array.isArray(result.input) ? (result.input as EvaluationResult['input']) : [],
941+
output: legacyOutputMessages,
942+
summary: legacySummary,
943+
finalOutput: output,
944+
tokenUsage: result.tokenUsage as EvaluationResult['tokenUsage'],
945+
costUsd: typeof result.costUsd === 'number' ? result.costUsd : undefined,
946+
durationMs: typeof result.durationMs === 'number' ? result.durationMs : undefined,
947+
target: typeof result.target === 'string' ? result.target : undefined,
948+
testId: typeof result.testId === 'string' ? result.testId : undefined,
949+
});
950+
899951
return {
900952
...result,
901953
timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(),
902954
testId: typeof result.testId === 'string' ? result.testId : 'unknown',
903955
score: typeof result.score === 'number' ? result.score : 0,
904956
assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [],
905957
target: typeof result.target === 'string' ? result.target : 'unknown',
906-
output: Array.isArray(result.output) ? result.output.filter(isOutputMessage) : [],
958+
output,
959+
trace,
907960
executionStatus: isExecutionStatus(result.executionStatus) ? result.executionStatus : 'ok',
908961
};
909962
}
@@ -959,23 +1012,10 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri
9591012
const lines: string[] = [];
9601013

9611014
for (const result of results) {
962-
const transcriptLines = toTranscriptJsonLines(
963-
{
964-
messages: [...(result.input ?? []), ...result.output],
965-
source: {
966-
provider: result.target,
967-
sessionId: result.conversationId ?? result.testId,
968-
startedAt: result.timestamp,
969-
},
970-
tokenUsage: result.tokenUsage,
971-
durationMs: result.durationMs,
972-
costUsd: result.costUsd,
973-
},
974-
{
975-
testId: result.testId,
976-
target: result.target,
977-
},
978-
);
1015+
const transcriptLines = traceToTranscriptJsonLines(result.trace, {
1016+
testId: result.testId,
1017+
target: result.target,
1018+
});
9791019

9801020
lines.push(...transcriptLines.map((line) => JSON.stringify(line)));
9811021
}
@@ -1085,14 +1125,16 @@ export async function writePerTestArtifacts(
10851125
if (input) {
10861126
await writeFile(path.join(testDir, 'input.md'), input, 'utf8');
10871127
}
1088-
if (result.output && result.output.length > 0) {
1128+
if (result.output.length > 0 || result.trace.messages.length > 0) {
10891129
const outputsDir = path.join(testDir, 'outputs');
10901130
await mkdir(outputsDir, { recursive: true });
1091-
await writeFile(
1092-
path.join(outputsDir, 'response.md'),
1093-
formatOutputMarkdown(result.output),
1094-
'utf8',
1095-
);
1131+
if (result.output.length > 0) {
1132+
await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8');
1133+
// Deprecated compatibility alias. New consumers should use answer.md
1134+
// for scored output or transcript.jsonl for the full execution record.
1135+
await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8');
1136+
}
1137+
await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result);
10961138
}
10971139

10981140
const taskBundle = await materializeTaskBundleForResult({
@@ -1156,14 +1198,16 @@ export async function writeArtifactsFromResults(
11561198
await writeFile(path.join(testDir, 'input.md'), input, 'utf8');
11571199
}
11581200

1159-
if (result.output && result.output.length > 0) {
1201+
if (result.output.length > 0 || result.trace.messages.length > 0) {
11601202
const outputsDir = path.join(testDir, 'outputs');
11611203
await mkdir(outputsDir, { recursive: true });
1162-
await writeFile(
1163-
path.join(outputsDir, 'response.md'),
1164-
formatOutputMarkdown(result.output),
1165-
'utf8',
1166-
);
1204+
if (result.output.length > 0) {
1205+
await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8');
1206+
// Deprecated compatibility alias. New consumers should use answer.md
1207+
// for scored output or transcript.jsonl for the full execution record.
1208+
await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8');
1209+
}
1210+
await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result);
11671211
}
11681212

11691213
const taskBundle = await materializeTaskBundleForResult({

apps/cli/src/commands/eval/commands/assert.ts

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ import path from 'node:path';
33
import { command, option, optional, positional, string } from 'cmd-ts';
44
import fg from 'fast-glob';
55

6-
import { executeScript } from '@agentv/core';
6+
import { buildTraceFromMessages, executeScript } from '@agentv/core';
77

88
export const evalAssertCommand = command({
99
name: 'assert',
@@ -64,17 +64,26 @@ export const evalAssertCommand = command({
6464

6565
// Build payload matching CodeGrader's expected format (snake_case).
6666
// Include all fields that defineCodeGrader validates as required.
67+
const messages = [{ role: 'assistant' as const, content: resolvedOutput }];
68+
const inputMessages = [{ role: 'user' as const, content: resolvedInput }];
69+
const trace = buildTraceFromMessages({
70+
input: inputMessages,
71+
output: messages,
72+
finalOutput: resolvedOutput,
73+
});
6774
const payload = JSON.stringify(
6875
{
69-
output: [{ role: 'assistant', content: resolvedOutput }],
70-
input: [{ role: 'user', content: resolvedInput }],
76+
output: resolvedOutput,
77+
answer: resolvedOutput,
78+
messages,
79+
input: inputMessages,
7180
question: resolvedInput,
7281
criteria: '',
7382
expected_output: [],
7483
reference_answer: '',
7584

7685
input_files: [],
77-
trace: null,
86+
trace,
7887
token_usage: null,
7988
cost_usd: null,
8089
duration_ms: null,

0 commit comments

Comments
 (0)