@@ -6,8 +6,12 @@ import {
66 type EvalTest ,
77 type EvaluationResult ,
88 type GraderResult ,
9+ type Message ,
910 type TargetDefinition ,
10- toTranscriptJsonLines ,
11+ type TraceSummary ,
12+ buildTraceFromMessages ,
13+ extractLastAssistantContent ,
14+ traceToTranscriptJsonLines ,
1115} from '@agentv/core' ;
1216import { toSnakeCaseDeep } from '../../utils/case-conversion.js' ;
1317import { RESULT_INDEX_FILENAME } from './result-layout.js' ;
@@ -195,7 +199,10 @@ export interface IndexArtifactEntry {
195199 readonly grading_path : string ;
196200 readonly timing_path : string ;
197201 readonly output_path ?: string ;
202+ readonly answer_path ?: string ;
203+ readonly transcript_path ?: string ;
198204 readonly input_path ?: string ;
205+ /** @deprecated Use output_path/answer_path for the final answer. */
199206 readonly response_path ?: string ;
200207 readonly task_dir ?: string ;
201208 readonly eval_path ?: string ;
@@ -245,23 +252,8 @@ function countToolCalls(result: EvaluationResult): {
245252 toolCalls : Record < string , number > ;
246253 total : number ;
247254} {
248- const toolCalls : Record < string , number > = { } ;
249- let total = 0 ;
250-
251- const trace = result . trace as
252- | { steps ?: readonly { toolName ?: string ; type ?: string } [ ] }
253- | undefined ;
254-
255- if ( trace ?. steps ) {
256- for ( const step of trace . steps ) {
257- if ( step . toolName || step . type === 'tool' ) {
258- const name = step . toolName ?? 'unknown' ;
259- toolCalls [ name ] = ( toolCalls [ name ] ?? 0 ) + 1 ;
260- total += 1 ;
261- }
262- }
263- }
264-
255+ const toolCalls = { ...( result . trace ?. toolCalls ?? { } ) } ;
256+ const total = Object . values ( toolCalls ) . reduce ( ( sum , count ) => sum + count , 0 ) ;
265257 return { toolCalls, total } ;
266258}
267259
@@ -365,9 +357,8 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact
365357 workspace_changes : parseWorkspaceChanges ( result . fileChanges ) ,
366358 conversation : result . conversationId
367359 ? {
368- turns : result . trace
369- ? ( ( result . trace as { steps ?: readonly unknown [ ] } ) . steps ?. length ?? 0 )
370- : 0 ,
360+ turns :
361+ result . trace ?. messages . filter ( ( message ) => message . role === 'assistant' ) . length ?? 0 ,
371362 conversation_id : result . conversationId ,
372363 }
373364 : undefined ,
@@ -661,7 +652,10 @@ export function buildIndexArtifactEntry(
661652 gradingPath : string ;
662653 timingPath : string ;
663654 outputPath ?: string ;
655+ answerPath ?: string ;
656+ transcriptPath ?: string ;
664657 inputPath ?: string ;
658+ responsePath ?: string ;
665659 taskBundle ?: MaterializedTaskBundlePaths ;
666660 } ,
667661) : IndexArtifactEntry {
@@ -689,9 +683,18 @@ export function buildIndexArtifactEntry(
689683 output_path : options . outputPath
690684 ? toRelativeArtifactPath ( options . outputDir , options . outputPath )
691685 : undefined ,
686+ answer_path : options . answerPath
687+ ? toRelativeArtifactPath ( options . outputDir , options . answerPath )
688+ : undefined ,
689+ transcript_path : options . transcriptPath
690+ ? toRelativeArtifactPath ( options . outputDir , options . transcriptPath )
691+ : undefined ,
692692 input_path : options . inputPath
693693 ? toRelativeArtifactPath ( options . outputDir , options . inputPath )
694694 : undefined ,
695+ response_path : options . responsePath
696+ ? toRelativeArtifactPath ( options . outputDir , options . responsePath )
697+ : undefined ,
695698 ...buildTaskBundleIndexFields ( options . outputDir , options . taskBundle ) ,
696699 metadata : result . metadata ,
697700 } ;
@@ -703,7 +706,8 @@ export function buildResultIndexArtifact(
703706) : ResultIndexArtifact {
704707 const artifactSubdir = buildArtifactSubdir ( result ) ;
705708 const input = extractInput ( result ) ;
706- const hasResponse = Array . isArray ( result . output ) && result . output . length > 0 ;
709+ const hasAnswer = result . output . length > 0 ;
710+ const hasTranscript = result . trace . messages . length > 0 || result . trace . events . length > 0 ;
707711
708712 return {
709713 timestamp : result . timestamp ,
@@ -725,10 +729,12 @@ export function buildResultIndexArtifact(
725729 grading_path : path . posix . join ( artifactSubdir , 'grading.json' ) ,
726730 timing_path : path . posix . join ( artifactSubdir , 'timing.json' ) ,
727731 input_path : input ? path . posix . join ( artifactSubdir , 'input.md' ) : undefined ,
728- output_path : hasResponse
729- ? path . posix . join ( artifactSubdir , 'outputs' , 'response.md' )
732+ output_path : hasAnswer ? path . posix . join ( artifactSubdir , 'outputs' , 'answer.md' ) : undefined ,
733+ answer_path : hasAnswer ? path . posix . join ( artifactSubdir , 'outputs' , 'answer.md' ) : undefined ,
734+ transcript_path : hasTranscript
735+ ? path . posix . join ( artifactSubdir , 'outputs' , 'transcript.jsonl' )
730736 : undefined ,
731- response_path : hasResponse
737+ response_path : hasAnswer
732738 ? path . posix . join ( artifactSubdir , 'outputs' , 'response.md' )
733739 : undefined ,
734740 ...( taskBundle
@@ -756,6 +762,16 @@ async function writeJsonlFile(filePath: string, records: readonly unknown[]): Pr
756762 await writeFile ( filePath , content , 'utf8' ) ;
757763}
758764
765+ async function writeTranscriptJsonl ( filePath : string , result : EvaluationResult ) : Promise < void > {
766+ const lines = traceToTranscriptJsonLines ( result . trace , {
767+ testId : result . testId ,
768+ target : result . target ,
769+ } ) ;
770+ const content =
771+ lines . length > 0 ? `${ lines . map ( ( line ) => JSON . stringify ( line ) ) . join ( '\n' ) } \n` : '' ;
772+ await writeFile ( filePath , content , 'utf8' ) ;
773+ }
774+
759775function isRecord ( value : unknown ) : value is Record < string , unknown > {
760776 return typeof value === 'object' && value !== null && ! Array . isArray ( value ) ;
761777}
@@ -852,6 +868,7 @@ type ParsedEvaluationResult = Record<string, unknown> & {
852868 assertions : EvaluationResult [ 'assertions' ] ;
853869 target : string ;
854870 output : EvaluationResult [ 'output' ] ;
871+ trace : EvaluationResult [ 'trace' ] ;
855872 executionStatus : EvaluationResult [ 'executionStatus' ] ;
856873} ;
857874
@@ -874,7 +891,7 @@ function isAssertionEntry(value: unknown): value is EvaluationResult['assertions
874891 ) ;
875892}
876893
877- function isOutputMessage ( value : unknown ) : value is EvaluationResult [ 'output' ] [ number ] {
894+ function isOutputMessage ( value : unknown ) : value is Message {
878895 if ( ! value || typeof value !== 'object' || Array . isArray ( value ) ) {
879896 return false ;
880897 }
@@ -890,20 +907,56 @@ function isExecutionStatus(value: unknown): value is EvaluationResult['execution
890907 ) ;
891908}
892909
910+ function isTraceRecord ( value : unknown ) : value is EvaluationResult [ 'trace' ] {
911+ return (
912+ ! ! value &&
913+ typeof value === 'object' &&
914+ ! Array . isArray ( value ) &&
915+ Array . isArray ( ( value as { messages ?: unknown } ) . messages ) &&
916+ Array . isArray ( ( value as { events ?: unknown } ) . events )
917+ ) ;
918+ }
919+
893920function normalizeParsedResult ( value : unknown ) : ParsedEvaluationResult | undefined {
894921 if ( ! value || typeof value !== 'object' || Array . isArray ( value ) ) {
895922 return undefined ;
896923 }
897924
898925 const result = value as Record < string , unknown > ;
926+ const legacyOutputMessages = Array . isArray ( result . output )
927+ ? result . output . filter ( isOutputMessage )
928+ : undefined ;
929+ const output =
930+ typeof result . output === 'string'
931+ ? result . output
932+ : extractLastAssistantContent ( legacyOutputMessages ) ;
933+ const legacySummary =
934+ result . trace && typeof result . trace === 'object' && ! Array . isArray ( result . trace )
935+ ? ( result . trace as TraceSummary )
936+ : undefined ;
937+ const trace = isTraceRecord ( result . trace )
938+ ? result . trace
939+ : buildTraceFromMessages ( {
940+ input : Array . isArray ( result . input ) ? ( result . input as EvaluationResult [ 'input' ] ) : [ ] ,
941+ output : legacyOutputMessages ,
942+ summary : legacySummary ,
943+ finalOutput : output ,
944+ tokenUsage : result . tokenUsage as EvaluationResult [ 'tokenUsage' ] ,
945+ costUsd : typeof result . costUsd === 'number' ? result . costUsd : undefined ,
946+ durationMs : typeof result . durationMs === 'number' ? result . durationMs : undefined ,
947+ target : typeof result . target === 'string' ? result . target : undefined ,
948+ testId : typeof result . testId === 'string' ? result . testId : undefined ,
949+ } ) ;
950+
899951 return {
900952 ...result ,
901953 timestamp : typeof result . timestamp === 'string' ? result . timestamp : new Date ( 0 ) . toISOString ( ) ,
902954 testId : typeof result . testId === 'string' ? result . testId : 'unknown' ,
903955 score : typeof result . score === 'number' ? result . score : 0 ,
904956 assertions : Array . isArray ( result . assertions ) ? result . assertions . filter ( isAssertionEntry ) : [ ] ,
905957 target : typeof result . target === 'string' ? result . target : 'unknown' ,
906- output : Array . isArray ( result . output ) ? result . output . filter ( isOutputMessage ) : [ ] ,
958+ output,
959+ trace,
907960 executionStatus : isExecutionStatus ( result . executionStatus ) ? result . executionStatus : 'ok' ,
908961 } ;
909962}
@@ -959,23 +1012,10 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri
9591012 const lines : string [ ] = [ ] ;
9601013
9611014 for ( const result of results ) {
962- const transcriptLines = toTranscriptJsonLines (
963- {
964- messages : [ ...( result . input ?? [ ] ) , ...result . output ] ,
965- source : {
966- provider : result . target ,
967- sessionId : result . conversationId ?? result . testId ,
968- startedAt : result . timestamp ,
969- } ,
970- tokenUsage : result . tokenUsage ,
971- durationMs : result . durationMs ,
972- costUsd : result . costUsd ,
973- } ,
974- {
975- testId : result . testId ,
976- target : result . target ,
977- } ,
978- ) ;
1015+ const transcriptLines = traceToTranscriptJsonLines ( result . trace , {
1016+ testId : result . testId ,
1017+ target : result . target ,
1018+ } ) ;
9791019
9801020 lines . push ( ...transcriptLines . map ( ( line ) => JSON . stringify ( line ) ) ) ;
9811021 }
@@ -1085,14 +1125,16 @@ export async function writePerTestArtifacts(
10851125 if ( input ) {
10861126 await writeFile ( path . join ( testDir , 'input.md' ) , input , 'utf8' ) ;
10871127 }
1088- if ( result . output && result . output . length > 0 ) {
1128+ if ( result . output . length > 0 || result . trace . messages . length > 0 ) {
10891129 const outputsDir = path . join ( testDir , 'outputs' ) ;
10901130 await mkdir ( outputsDir , { recursive : true } ) ;
1091- await writeFile (
1092- path . join ( outputsDir , 'response.md' ) ,
1093- formatOutputMarkdown ( result . output ) ,
1094- 'utf8' ,
1095- ) ;
1131+ if ( result . output . length > 0 ) {
1132+ await writeFile ( path . join ( outputsDir , 'answer.md' ) , result . output , 'utf8' ) ;
1133+ // Deprecated compatibility alias. New consumers should use answer.md
1134+ // for scored output or transcript.jsonl for the full execution record.
1135+ await writeFile ( path . join ( outputsDir , 'response.md' ) , result . output , 'utf8' ) ;
1136+ }
1137+ await writeTranscriptJsonl ( path . join ( outputsDir , 'transcript.jsonl' ) , result ) ;
10961138 }
10971139
10981140 const taskBundle = await materializeTaskBundleForResult ( {
@@ -1156,14 +1198,16 @@ export async function writeArtifactsFromResults(
11561198 await writeFile ( path . join ( testDir , 'input.md' ) , input , 'utf8' ) ;
11571199 }
11581200
1159- if ( result . output && result . output . length > 0 ) {
1201+ if ( result . output . length > 0 || result . trace . messages . length > 0 ) {
11601202 const outputsDir = path . join ( testDir , 'outputs' ) ;
11611203 await mkdir ( outputsDir , { recursive : true } ) ;
1162- await writeFile (
1163- path . join ( outputsDir , 'response.md' ) ,
1164- formatOutputMarkdown ( result . output ) ,
1165- 'utf8' ,
1166- ) ;
1204+ if ( result . output . length > 0 ) {
1205+ await writeFile ( path . join ( outputsDir , 'answer.md' ) , result . output , 'utf8' ) ;
1206+ // Deprecated compatibility alias. New consumers should use answer.md
1207+ // for scored output or transcript.jsonl for the full execution record.
1208+ await writeFile ( path . join ( outputsDir , 'response.md' ) , result . output , 'utf8' ) ;
1209+ }
1210+ await writeTranscriptJsonl ( path . join ( outputsDir , 'transcript.jsonl' ) , result ) ;
11671211 }
11681212
11691213 const taskBundle = await materializeTaskBundleForResult ( {
0 commit comments