From 06a69e8543aac67596af1e01f0e7c3452636827e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 12 Jun 2026 07:24:42 +0200 Subject: [PATCH 1/7] feat(trace): add canonical evaluation trace model --- packages/core/src/evaluation/trace.ts | 496 +++++++++++++++++++++----- packages/core/src/import/index.ts | 2 + packages/core/src/import/types.ts | 59 +++ packages/eval/src/index.ts | 37 +- packages/eval/src/schemas.ts | 107 +++++- 5 files changed, 598 insertions(+), 103 deletions(-) diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index d6d4f8210..1541961db 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -1,19 +1,21 @@ /** * Trace models for evaluation-time agent behavior. * - * This module separates the canonical trace contract from compatibility views: - * - NormalizedTrajectory is the full, versioned trajectory contract that importers, - * replay, and trajectory-aware graders should use as the source of truth. - * - TraceSummary is a derived compact read model used by existing graders, result - * artifacts, and CLI/dashboard aggregation. When a full trajectory exists, do - * not author TraceSummary independently; derive it with - * computeTraceSummaryFromTrajectory(). + * `Trace` is AgentV's canonical normalized execution model. Evaluation results + * keep `output` as the final answer/scored result only; the full transcript, + * tool calls/results, errors, timing, usage, provider/session provenance, and + * replay/eval metrics live in `trace`. * - * Keep TypeScript internals camelCase. Persisted trajectory artifacts use the - * snake_case NormalizedTrajectoryWire shape and must pass through the converters - * in this file. + * `TraceSummary` is a derived compact read model for metric-style graders and + * aggregation. Derive it from `Trace.messages`/`Trace.events`; do not treat it + * as the canonical trace. + * + * TypeScript internals are camelCase. Persisted JSON/JSONL wire shapes are + * snake_case and must pass through the converters in this file or a boundary + * serializer such as `toSnakeCaseDeep()`. */ import { z } from 'zod'; +import type { Message } from './providers/types.js'; export const NORMALIZED_TRAJECTORY_SCHEMA_VERSION = 'agentv.trace.v1' as const; @@ -32,19 +34,36 @@ export const NORMALIZED_TRACE_EVENT_TYPES = [ 'model_turn', 'tool_call', 'tool_result', + 'final_response', + 'error', ] as const; export const NORMALIZED_TOOL_STATUSES = ['ok', 'error', 'timeout', 'cancelled', 'unknown'] as const; export const NORMALIZED_REDACTION_LEVELS = ['none', 'partial', 'full'] as const; -export type NormalizedTraceSourceKind = (typeof NORMALIZED_TRACE_SOURCE_KINDS)[number]; -export type NormalizedTraceEventType = (typeof NORMALIZED_TRACE_EVENT_TYPES)[number]; -export type NormalizedToolStatus = (typeof NORMALIZED_TOOL_STATUSES)[number]; -export type NormalizedRedactionLevel = (typeof NORMALIZED_REDACTION_LEVELS)[number]; - -export interface NormalizedTraceSource { - readonly kind: NormalizedTraceSourceKind; +export const TRACE_SCHEMA_VERSION = NORMALIZED_TRAJECTORY_SCHEMA_VERSION; +export const TRACE_SOURCE_KINDS = NORMALIZED_TRACE_SOURCE_KINDS; +export const TRACE_EVENT_TYPES = NORMALIZED_TRACE_EVENT_TYPES; +export const TRACE_TOOL_STATUSES = NORMALIZED_TOOL_STATUSES; +export const TRACE_REDACTION_LEVELS = NORMALIZED_REDACTION_LEVELS; + +export type TraceSourceKind = (typeof TRACE_SOURCE_KINDS)[number]; +export type TraceEventType = (typeof TRACE_EVENT_TYPES)[number]; +export type TraceToolStatus = (typeof TRACE_TOOL_STATUSES)[number]; +export type TraceRedactionLevel = (typeof TRACE_REDACTION_LEVELS)[number]; + +/** @deprecated Use TraceSourceKind. */ +export type NormalizedTraceSourceKind = TraceSourceKind; +/** @deprecated Use TraceEventType. */ +export type NormalizedTraceEventType = TraceEventType; +/** @deprecated Use TraceToolStatus. */ +export type NormalizedToolStatus = TraceToolStatus; +/** @deprecated Use TraceRedactionLevel. */ +export type NormalizedRedactionLevel = TraceRedactionLevel; + +export interface TraceSource { + readonly kind: TraceSourceKind; readonly path?: string; readonly url?: string; readonly provider?: string; @@ -53,7 +72,7 @@ export interface NormalizedTraceSource { readonly metadata?: Readonly>; } -export interface NormalizedTraceSession { +export interface TraceSession { readonly sessionId?: string; readonly conversationId?: string; readonly cwd?: string; @@ -62,7 +81,7 @@ export interface NormalizedTraceSession { readonly metadata?: Readonly>; } -export interface NormalizedTraceBranch { +export interface TraceBranch { readonly selectedLeafId?: string; readonly selectedPathIds?: readonly string[]; readonly includedEventIds?: readonly string[]; @@ -70,7 +89,7 @@ export interface NormalizedTraceBranch { readonly selectionReason?: string; } -export interface NormalizedTraceSourceRef { +export interface TraceSourceRef { readonly eventId?: string; readonly messageId?: string; readonly spanId?: string; @@ -81,7 +100,7 @@ export interface NormalizedTraceSourceRef { readonly metadata?: Readonly>; } -export interface NormalizedRawEvidence { +export interface TraceRawEvidence { readonly kind: string; readonly ref?: string; readonly mediaType?: string; @@ -90,13 +109,13 @@ export interface NormalizedRawEvidence { readonly metadata?: Readonly>; } -export interface NormalizedRedactionState { - readonly level: NormalizedRedactionLevel; +export interface TraceRedactionState { + readonly level: TraceRedactionLevel; readonly fields?: readonly string[]; readonly reason?: string; } -export interface NormalizedTraceError { +export interface TraceError { readonly message: string; readonly name?: string; readonly code?: string; @@ -104,16 +123,16 @@ export interface NormalizedTraceError { readonly metadata?: Readonly>; } -export interface NormalizedTraceMessage { +export interface TraceMessage { readonly role: string; readonly name?: string; readonly content?: unknown; - readonly redaction?: NormalizedRedactionState; + readonly redaction?: TraceRedactionState; readonly tokenUsage?: TokenUsage; readonly metadata?: Readonly>; } -export interface NormalizedTraceModel { +export interface TraceModel { readonly provider?: string; readonly name?: string; readonly invocationId?: string; @@ -121,48 +140,49 @@ export interface NormalizedTraceModel { readonly metadata?: Readonly>; } -export interface NormalizedTraceTool { +export interface TraceTool { readonly name: string; readonly callId?: string; readonly input?: unknown; readonly output?: unknown; - readonly status?: NormalizedToolStatus; - readonly error?: NormalizedTraceError; - readonly redaction?: NormalizedRedactionState; + readonly status?: TraceToolStatus; + readonly error?: TraceError; + readonly redaction?: TraceRedactionState; readonly metadata?: Readonly>; } -export interface NormalizedTraceEvent { +export interface TraceEvent { readonly eventId: string; readonly parentEventId?: string; readonly ordinal: number; - readonly type: NormalizedTraceEventType; + readonly type: TraceEventType; readonly timestamp?: string; readonly durationMs?: number; readonly durationInferred?: boolean; readonly turnIndex?: number; - readonly message?: NormalizedTraceMessage; - readonly model?: NormalizedTraceModel; - readonly tool?: NormalizedTraceTool; - readonly sourceRef?: NormalizedTraceSourceRef; - readonly rawEvidence?: readonly NormalizedRawEvidence[]; - readonly redaction?: NormalizedRedactionState; + readonly message?: TraceMessage; + readonly model?: TraceModel; + readonly tool?: TraceTool; + readonly error?: TraceError; + readonly sourceRef?: TraceSourceRef; + readonly rawEvidence?: readonly TraceRawEvidence[]; + readonly redaction?: TraceRedactionState; readonly metadata?: Readonly>; } /** - * Canonical in-memory trajectory model. + * Legacy imported trace artifact shape used by older import/replay helpers. * - * Persisted trajectory artifacts are the snake_case wire shape below. They do - * not embed TraceSummary because compact summaries are one-way projections from - * this full event stream. + * New evaluation results use `Trace` below: final answer in `output`, full + * transcript under `trace.messages`, structured spans under `trace.events`, and + * provider-native session identifiers in `trace.metadata`. */ -export interface NormalizedTrajectory { +export interface TraceArtifact { readonly schemaVersion: typeof NORMALIZED_TRAJECTORY_SCHEMA_VERSION; - readonly source: NormalizedTraceSource; - readonly session: NormalizedTraceSession; - readonly branch?: NormalizedTraceBranch; - readonly events: readonly NormalizedTraceEvent[]; + readonly source: TraceSource; + readonly session: TraceSession; + readonly branch?: TraceBranch; + readonly events: readonly TraceEvent[]; readonly tokenUsage?: TokenUsage; readonly costUsd?: number; readonly durationMs?: number; @@ -171,6 +191,12 @@ export interface NormalizedTrajectory { readonly metadata?: Readonly>; } +/** + * @deprecated Use `Trace` for evaluation results or `TraceArtifact` for legacy + * import/replay artifacts. + */ +export type NormalizedTrajectory = TraceArtifact; + function omitUndefinedProperties>(value: T): T { return Object.fromEntries( Object.entries(value).filter(([, property]) => property !== undefined), @@ -286,6 +312,7 @@ export const NormalizedTraceEventWireSchema = z.object({ message: NormalizedTraceMessageWireSchema.optional(), model: NormalizedTraceModelWireSchema.optional(), tool: NormalizedTraceToolWireSchema.optional(), + error: NormalizedTraceErrorWireSchema.optional(), source_ref: NormalizedTraceSourceRefWireSchema.optional(), raw_evidence: z.array(NormalizedRawEvidenceWireSchema).optional(), redaction: NormalizedRedactionStateWireSchema.optional(), @@ -309,9 +336,23 @@ export const NormalizedTrajectoryWireSchema = z.object({ export type NormalizedTrajectoryWire = z.infer; export type NormalizedTraceEventWire = z.infer; -export function toNormalizedTrajectoryWire( - trajectory: NormalizedTrajectory, -): NormalizedTrajectoryWire { +export const TraceRedactionStateWireSchema = NormalizedRedactionStateWireSchema; +export const TraceErrorWireSchema = NormalizedTraceErrorWireSchema; +export const TraceSourceWireSchema = NormalizedTraceSourceWireSchema; +export const TraceSessionWireSchema = NormalizedTraceSessionWireSchema; +export const TraceBranchWireSchema = NormalizedTraceBranchWireSchema; +export const TraceSourceRefWireSchema = NormalizedTraceSourceRefWireSchema; +export const TraceRawEvidenceWireSchema = NormalizedRawEvidenceWireSchema; +export const TraceMessageWireSchema = NormalizedTraceMessageWireSchema; +export const TraceModelWireSchema = NormalizedTraceModelWireSchema; +export const TraceToolWireSchema = NormalizedTraceToolWireSchema; +export const TraceEventWireSchema = NormalizedTraceEventWireSchema; +export const TraceArtifactWireSchema = NormalizedTrajectoryWireSchema; + +export type TraceArtifactWire = NormalizedTrajectoryWire; +export type TraceEventWire = NormalizedTraceEventWire; + +export function toNormalizedTrajectoryWire(trajectory: TraceArtifact): NormalizedTrajectoryWire { return NormalizedTrajectoryWireSchema.parse( omitUndefinedProperties({ schema_version: trajectory.schemaVersion, @@ -329,7 +370,7 @@ export function toNormalizedTrajectoryWire( ); } -export function fromNormalizedTrajectoryWire(input: unknown): NormalizedTrajectory { +export function fromNormalizedTrajectoryWire(input: unknown): TraceArtifact { const wire = NormalizedTrajectoryWireSchema.parse(input); return { @@ -347,7 +388,15 @@ export function fromNormalizedTrajectoryWire(input: unknown): NormalizedTrajecto }; } -function toNormalizedTraceSourceWire(source: NormalizedTraceSource) { +export function toTraceArtifactWire(artifact: TraceArtifact): TraceArtifactWire { + return toNormalizedTrajectoryWire(artifact); +} + +export function fromTraceArtifactWire(input: unknown): TraceArtifact { + return fromNormalizedTrajectoryWire(input); +} + +function toNormalizedTraceSourceWire(source: TraceSource) { return omitUndefinedProperties({ kind: source.kind, path: source.path, @@ -361,7 +410,7 @@ function toNormalizedTraceSourceWire(source: NormalizedTraceSource) { function fromNormalizedTraceSourceWire( source: z.infer, -): NormalizedTraceSource { +): TraceSource { return { kind: source.kind, path: source.path, @@ -373,7 +422,7 @@ function fromNormalizedTraceSourceWire( }; } -function toNormalizedTraceSessionWire(session: NormalizedTraceSession) { +function toNormalizedTraceSessionWire(session: TraceSession) { return omitUndefinedProperties({ session_id: session.sessionId, conversation_id: session.conversationId, @@ -386,7 +435,7 @@ function toNormalizedTraceSessionWire(session: NormalizedTraceSession) { function fromNormalizedTraceSessionWire( session: z.infer, -): NormalizedTraceSession { +): TraceSession { return { sessionId: session.session_id, conversationId: session.conversation_id, @@ -397,7 +446,7 @@ function fromNormalizedTraceSessionWire( }; } -function toNormalizedTraceBranchWire(branch: NormalizedTraceBranch) { +function toNormalizedTraceBranchWire(branch: TraceBranch) { return omitUndefinedProperties({ selected_leaf_id: branch.selectedLeafId, selected_path_ids: branch.selectedPathIds, @@ -409,7 +458,7 @@ function toNormalizedTraceBranchWire(branch: NormalizedTraceBranch) { function fromNormalizedTraceBranchWire( branch: z.infer, -): NormalizedTraceBranch { +): TraceBranch { return { selectedLeafId: branch.selected_leaf_id, selectedPathIds: branch.selected_path_ids, @@ -419,7 +468,7 @@ function fromNormalizedTraceBranchWire( }; } -function toNormalizedTraceEventWire(event: NormalizedTraceEvent): NormalizedTraceEventWire { +function toNormalizedTraceEventWire(event: TraceEvent): NormalizedTraceEventWire { return NormalizedTraceEventWireSchema.parse( omitUndefinedProperties({ event_id: event.eventId, @@ -433,6 +482,7 @@ function toNormalizedTraceEventWire(event: NormalizedTraceEvent): NormalizedTrac message: event.message ? toNormalizedTraceMessageWire(event.message) : undefined, model: event.model ? toNormalizedTraceModelWire(event.model) : undefined, tool: event.tool ? toNormalizedTraceToolWire(event.tool) : undefined, + error: event.error ? toNormalizedTraceErrorWire(event.error) : undefined, source_ref: event.sourceRef ? toNormalizedTraceSourceRefWire(event.sourceRef) : undefined, raw_evidence: event.rawEvidence?.map(toNormalizedRawEvidenceWire), redaction: event.redaction, @@ -441,7 +491,7 @@ function toNormalizedTraceEventWire(event: NormalizedTraceEvent): NormalizedTrac ); } -function fromNormalizedTraceEventWire(event: NormalizedTraceEventWire): NormalizedTraceEvent { +function fromNormalizedTraceEventWire(event: NormalizedTraceEventWire): TraceEvent { return { eventId: event.event_id, parentEventId: event.parent_event_id, @@ -454,6 +504,7 @@ function fromNormalizedTraceEventWire(event: NormalizedTraceEventWire): Normaliz message: event.message ? fromNormalizedTraceMessageWire(event.message) : undefined, model: event.model ? fromNormalizedTraceModelWire(event.model) : undefined, tool: event.tool ? fromNormalizedTraceToolWire(event.tool) : undefined, + error: event.error ? fromNormalizedTraceErrorWire(event.error) : undefined, sourceRef: event.source_ref ? fromNormalizedTraceSourceRefWire(event.source_ref) : undefined, rawEvidence: event.raw_evidence?.map(fromNormalizedRawEvidenceWire), redaction: event.redaction, @@ -461,7 +512,7 @@ function fromNormalizedTraceEventWire(event: NormalizedTraceEventWire): Normaliz }; } -function toNormalizedTraceMessageWire(message: NormalizedTraceMessage) { +function toNormalizedTraceMessageWire(message: TraceMessage) { return omitUndefinedProperties({ role: message.role, name: message.name, @@ -474,7 +525,7 @@ function toNormalizedTraceMessageWire(message: NormalizedTraceMessage) { function fromNormalizedTraceMessageWire( message: z.infer, -): NormalizedTraceMessage { +): TraceMessage { return { role: message.role, name: message.name, @@ -485,7 +536,7 @@ function fromNormalizedTraceMessageWire( }; } -function toNormalizedTraceModelWire(model: NormalizedTraceModel) { +function toNormalizedTraceModelWire(model: TraceModel) { return omitUndefinedProperties({ provider: model.provider, name: model.name, @@ -497,7 +548,7 @@ function toNormalizedTraceModelWire(model: NormalizedTraceModel) { function fromNormalizedTraceModelWire( model: z.infer, -): NormalizedTraceModel { +): TraceModel { return { provider: model.provider, name: model.name, @@ -507,7 +558,7 @@ function fromNormalizedTraceModelWire( }; } -function toNormalizedTraceToolWire(tool: NormalizedTraceTool) { +function toNormalizedTraceToolWire(tool: TraceTool) { return omitUndefinedProperties({ name: tool.name, call_id: tool.callId, @@ -522,7 +573,7 @@ function toNormalizedTraceToolWire(tool: NormalizedTraceTool) { function fromNormalizedTraceToolWire( tool: z.infer, -): NormalizedTraceTool { +): TraceTool { return { name: tool.name, callId: tool.call_id, @@ -535,7 +586,29 @@ function fromNormalizedTraceToolWire( }; } -function toNormalizedTraceSourceRefWire(sourceRef: NormalizedTraceSourceRef) { +function toNormalizedTraceErrorWire(error: TraceError) { + return omitUndefinedProperties({ + message: error.message, + name: error.name, + code: error.code, + stack: error.stack, + metadata: error.metadata, + }); +} + +function fromNormalizedTraceErrorWire( + error: z.infer, +): TraceError { + return { + message: error.message, + name: error.name, + code: error.code, + stack: error.stack, + metadata: error.metadata, + }; +} + +function toNormalizedTraceSourceRefWire(sourceRef: TraceSourceRef) { return omitUndefinedProperties({ event_id: sourceRef.eventId, message_id: sourceRef.messageId, @@ -550,7 +623,7 @@ function toNormalizedTraceSourceRefWire(sourceRef: NormalizedTraceSourceRef) { function fromNormalizedTraceSourceRefWire( sourceRef: z.infer, -): NormalizedTraceSourceRef { +): TraceSourceRef { return { eventId: sourceRef.event_id, messageId: sourceRef.message_id, @@ -563,7 +636,7 @@ function fromNormalizedTraceSourceRefWire( }; } -function toNormalizedRawEvidenceWire(evidence: NormalizedRawEvidence) { +function toNormalizedRawEvidenceWire(evidence: TraceRawEvidence) { return omitUndefinedProperties({ kind: evidence.kind, ref: evidence.ref, @@ -576,7 +649,7 @@ function toNormalizedRawEvidenceWire(evidence: NormalizedRawEvidence) { function fromNormalizedRawEvidenceWire( evidence: z.infer, -): NormalizedRawEvidence { +): TraceRawEvidence { return { kind: evidence.kind, ref: evidence.ref, @@ -587,6 +660,32 @@ function fromNormalizedRawEvidenceWire( }; } +// Deprecated compatibility names retained for callers that imported the older +// normalized-trace terminology. New code should use the AgentV-owned Trace* +// names above. +/** @deprecated Use TraceSource. */ +export type NormalizedTraceSource = TraceSource; +/** @deprecated Use TraceSession. */ +export type NormalizedTraceSession = TraceSession; +/** @deprecated Use TraceBranch. */ +export type NormalizedTraceBranch = TraceBranch; +/** @deprecated Use TraceSourceRef. */ +export type NormalizedTraceSourceRef = TraceSourceRef; +/** @deprecated Use TraceRawEvidence. */ +export type NormalizedRawEvidence = TraceRawEvidence; +/** @deprecated Use TraceRedactionState. */ +export type NormalizedRedactionState = TraceRedactionState; +/** @deprecated Use TraceError. */ +export type NormalizedTraceError = TraceError; +/** @deprecated Use TraceMessage. */ +export type NormalizedTraceMessage = TraceMessage; +/** @deprecated Use TraceModel. */ +export type NormalizedTraceModel = TraceModel; +/** @deprecated Use TraceTool. */ +export type NormalizedTraceTool = TraceTool; +/** @deprecated Use TraceEvent. */ +export type NormalizedTraceEvent = TraceEvent; + /** * Token usage metrics from provider execution. */ @@ -605,8 +704,8 @@ export interface TokenUsage { * Derived compact summary of a trace for lightweight persistence. * * This is a compatibility/read model for existing result artifacts and - * aggregation. It is intentionally smaller than NormalizedTrajectory and should - * not be treated as independently authored trace state when a full trajectory is + * aggregation. It is intentionally smaller than Trace and should + * not be treated as independently authored trace state when a full Trace is * available. */ export interface TraceSummary { @@ -622,6 +721,247 @@ export interface TraceSummary { readonly llmCallCount?: number; } +/** + * Canonical trace attached to every evaluation result. + * + * The compact TraceSummary fields are mirrored for existing + * metric graders; `messages` and `events` are the complete canonical + * execution record. Result `output` is only the final answer; tools, + * intermediate assistant text, timing, usage, provider provenance, and replay + * metadata live here. + */ +export interface Trace extends TraceSummary { + readonly schemaVersion: typeof TRACE_SCHEMA_VERSION; + /** Complete normalized chat transcript used for transcript-aware graders. */ + readonly messages: readonly Message[]; + /** Structured event stream derived from the same messages and metrics. */ + readonly events: readonly TraceEvent[]; + readonly tokenUsage?: TokenUsage; + readonly costUsd?: number; + readonly durationMs?: number; + readonly startTime?: string; + readonly endTime?: string; + /** Provider/session/eval provenance. Provider-native IDs use metadata keys. */ + readonly metadata?: Readonly>; +} + +interface BuildTraceOptions { + readonly input?: readonly Message[]; + readonly output?: readonly Message[]; + readonly summary?: TraceSummary; + readonly finalOutput?: string; + readonly tokenUsage?: TokenUsage; + readonly costUsd?: number; + readonly durationMs?: number; + readonly startTime?: string; + readonly endTime?: string; + readonly provider?: string; + readonly target?: string; + readonly testId?: string; + readonly conversationId?: string; + readonly metadata?: Readonly>; + readonly error?: TraceError | string; +} + +function sameMessageContent(first: Message | undefined, second: Message | undefined): boolean { + if (!first || !second) return false; + return ( + first.role === second.role && JSON.stringify(first.content) === JSON.stringify(second.content) + ); +} + +function buildTraceMessages( + input: readonly Message[] | undefined, + output: readonly Message[] | undefined, +): readonly Message[] { + const outputMessages = output ?? []; + if (outputMessages.length === 0) { + return input ?? []; + } + + // Agent/transcript providers often return a full conversation (including the + // user/system turns). Single-shot LLM providers usually return only the final + // assistant message. Avoid duplicating the prompt when the provider already + // supplied a conversation-shaped transcript. + const outputLooksLikeFullTranscript = outputMessages.some( + (message) => message.role === 'user' || message.role === 'system', + ); + if (outputLooksLikeFullTranscript) { + return outputMessages; + } + + const inputMessages = input ?? []; + if ( + inputMessages.length === 1 && + outputMessages.length > 0 && + sameMessageContent(inputMessages[0], outputMessages[0]) + ) { + return outputMessages; + } + return [...inputMessages, ...outputMessages]; +} + +function toTraceMessage(message: Message): TraceMessage { + return { + role: message.role, + name: message.name, + content: message.content, + tokenUsage: message.tokenUsage, + metadata: message.metadata, + }; +} + +function toTraceError(error: TraceError | string): TraceError { + return typeof error === 'string' ? { message: error } : error; +} + +/** + * Build the canonical trace for an evaluation case from provider messages and + * execution metrics. This is the single projection used by result JSONL, + * code-grader stdin, `outputs/answer.md`, and `outputs/transcript.jsonl`. + */ +export function buildTraceFromMessages(options: BuildTraceOptions = {}): Trace { + const messages = buildTraceMessages(options.input, options.output); + const computed = computeTraceSummary(messages); + const summary = options.summary ?? computed.trace; + const events: TraceEvent[] = []; + let ordinal = 0; + + for (const [messageIndex, message] of messages.entries()) { + const eventId = `message-${messageIndex}`; + events.push({ + eventId, + ordinal: ordinal++, + type: 'message', + timestamp: message.startTime, + durationMs: message.durationMs, + message: toTraceMessage(message), + metadata: { message_index: messageIndex }, + }); + + for (const [toolIndex, toolCall] of (message.toolCalls ?? []).entries()) { + const toolEventId = `message-${messageIndex}-tool-${toolIndex}`; + events.push({ + eventId: toolEventId, + parentEventId: eventId, + ordinal: ordinal++, + type: 'tool_call', + timestamp: toolCall.startTime, + durationMs: toolCall.durationMs, + tool: { + name: toolCall.tool, + callId: toolCall.id, + input: toolCall.input, + output: toolCall.output, + status: 'ok', + }, + metadata: { + message_index: messageIndex, + tool_index: toolIndex, + }, + }); + + if (toolCall.output !== undefined) { + events.push({ + eventId: `${toolEventId}-result`, + parentEventId: toolEventId, + ordinal: ordinal++, + type: 'tool_result', + timestamp: toolCall.endTime, + tool: { + name: toolCall.tool, + callId: toolCall.id, + output: toolCall.output, + status: 'ok', + }, + metadata: { + message_index: messageIndex, + tool_index: toolIndex, + }, + }); + } + } + } + + const finalAssistantIndex = [...messages] + .map((message, index) => ({ message, index })) + .reverse() + .find((entry) => entry.message.role === 'assistant')?.index; + if (finalAssistantIndex !== undefined) { + const finalMessage = messages[finalAssistantIndex]; + events.push({ + eventId: 'final-response', + parentEventId: `message-${finalAssistantIndex}`, + ordinal: ordinal++, + type: 'final_response', + timestamp: finalMessage.endTime ?? finalMessage.startTime ?? options.endTime, + message: { + ...toTraceMessage(finalMessage), + content: options.finalOutput ?? finalMessage.content, + }, + metadata: { message_index: finalAssistantIndex }, + }); + } + + if (options.error) { + events.push({ + eventId: 'error', + ordinal: ordinal++, + type: 'error', + timestamp: options.endTime, + error: toTraceError(options.error), + }); + } + + return { + schemaVersion: TRACE_SCHEMA_VERSION, + eventCount: summary.eventCount, + toolCalls: summary.toolCalls, + errorCount: summary.errorCount + (options.error ? 1 : 0), + llmCallCount: summary.llmCallCount, + ...(summary.toolDurations ? { toolDurations: summary.toolDurations } : {}), + messages, + events, + tokenUsage: options.tokenUsage, + costUsd: options.costUsd, + durationMs: options.durationMs, + startTime: options.startTime ?? computed.startTime, + endTime: options.endTime ?? computed.endTime, + metadata: { + ...(options.provider ? { provider: options.provider } : {}), + ...(options.target ? { target: options.target } : {}), + ...(options.testId ? { eval_case_id: options.testId } : {}), + ...(options.conversationId ? { provider_session_id: options.conversationId } : {}), + ...options.metadata, + }, + }; +} + +/** + * Return a copy of an existing trace with an appended structured error event. + */ +export function appendErrorEventToTrace( + trace: Trace, + error: TraceError | string, + metadata?: Readonly>, +): Trace { + return { + ...trace, + errorCount: trace.errorCount + 1, + events: [ + ...trace.events, + { + eventId: `error-${trace.events.length}`, + ordinal: trace.events.length, + type: 'error', + timestamp: trace.endTime, + error: toTraceError(error), + metadata, + }, + ], + }; +} + /** * Combined result of trace computation + execution metrics merge. * Returned by computeTraceSummaryWithMetrics(). @@ -793,9 +1133,7 @@ export function computeTraceSummary(messages: readonly MessageLike[]): TraceComp * source also carries explicit `branch.includedEventIds`, honor it here so * branchable transcripts cannot accidentally grade omitted alternatives. */ -export function getSelectedTrajectoryEvents( - trajectory: NormalizedTrajectory, -): readonly NormalizedTraceEvent[] { +export function getSelectedTrajectoryEvents(trajectory: TraceArtifact): readonly TraceEvent[] { if (!trajectory.branch?.includedEventIds || trajectory.branch.includedEventIds.length === 0) { return trajectory.events; } @@ -809,16 +1147,14 @@ export function getSelectedTrajectoryEvents( * * This is the canonical bridge from the high-fidelity trajectory contract to the * backward-compatible summary/read model. Keep the projection one-way: importers - * and replay should preserve NormalizedTrajectory, while existing result readers + * and replay should preserve TraceArtifact or Trace, while existing result readers * can continue consuming the derived TraceSummary shape unchanged. * * The summary keeps the current lightweight contract: eventCount is the number * of tool-call events, toolCalls is counted by tool name, toolDurations carries * per-tool milliseconds when present, and llmCallCount counts model turns. */ -export function computeTraceSummaryFromTrajectory( - trajectory: NormalizedTrajectory, -): TraceComputeResult { +export function computeTraceSummaryFromTrajectory(trajectory: TraceArtifact): TraceComputeResult { const selectedEvents = getSelectedTrajectoryEvents(trajectory); const hasModelTurnEvents = selectedEvents.some((event) => event.type === 'model_turn'); const toolCallCounts: Record = {}; @@ -896,7 +1232,7 @@ function deriveEventEnd(start: Date | undefined, durationMs: number | undefined) return new Date(start.getTime() + durationMs); } -function isErrorToolEvent(event: NormalizedTraceEvent): boolean { +function isErrorToolEvent(event: TraceEvent): boolean { return Boolean( event.tool?.error || event.tool?.status === 'error' || diff --git a/packages/core/src/import/index.ts b/packages/core/src/import/index.ts index 4170f4128..5f585e9a9 100644 --- a/packages/core/src/import/index.ts +++ b/packages/core/src/import/index.ts @@ -15,6 +15,8 @@ export { groupTranscriptJsonLines, readTranscriptFile, readTranscriptJsonl, + traceFromTranscriptJsonLines, + traceToTranscriptJsonLines, toTranscriptJsonLines, type TranscriptEntry, type TranscriptJsonLine, diff --git a/packages/core/src/import/types.ts b/packages/core/src/import/types.ts index 4a69f5946..2a451c24f 100644 --- a/packages/core/src/import/types.ts +++ b/packages/core/src/import/types.ts @@ -18,6 +18,7 @@ import { readFile } from 'node:fs/promises'; import { toCamelCaseDeep, toSnakeCaseDeep } from '../evaluation/case-conversion.js'; import type { Message, ProviderTokenUsage } from '../evaluation/providers/types.js'; +import { buildTraceFromMessages, type Trace } from '../evaluation/trace.js'; /** * A parsed transcript: ordered messages plus session metadata (internal camelCase). @@ -148,6 +149,64 @@ export function toTranscriptJsonLines( })); } +/** + * Convert a canonical evaluation trace to transcript JSONL rows. + */ +export function traceToTranscriptJsonLines( + trace: Trace, + options?: { testId?: string; target?: string }, +): TranscriptJsonLine[] { + const provider = + (typeof trace.metadata?.provider === 'string' ? trace.metadata.provider : undefined) ?? + options?.target ?? + 'agentv'; + const sessionId = + (typeof trace.metadata?.provider_session_id === 'string' + ? trace.metadata.provider_session_id + : undefined) ?? + (typeof trace.metadata?.eval_case_id === 'string' ? trace.metadata.eval_case_id : undefined) ?? + options?.testId ?? + 'trace'; + + return toTranscriptJsonLines( + { + messages: [...trace.messages], + source: { + provider, + sessionId, + startedAt: trace.startTime, + }, + tokenUsage: trace.tokenUsage, + durationMs: trace.durationMs, + costUsd: trace.costUsd, + }, + options, + ); +} + +/** + * Reconstruct a canonical trace/messages representation from transcript JSONL + * rows. Transcript-aware graders can use this for offline replay parity. + */ +export function traceFromTranscriptJsonLines(lines: readonly TranscriptJsonLine[]): Trace { + const [entry] = groupTranscriptJsonLines(lines); + if (!entry) { + return buildTraceFromMessages(); + } + + return buildTraceFromMessages({ + output: entry.messages, + tokenUsage: entry.tokenUsage, + durationMs: entry.durationMs, + costUsd: entry.costUsd ?? undefined, + startTime: entry.source.startedAt, + provider: entry.source.provider, + target: entry.target, + testId: entry.testId, + conversationId: entry.source.sessionId, + }); +} + function buildReplayMessage(line: TranscriptJsonLine): Message { const camelCased = toCamelCaseDeep(line) as { role: string; diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts index 5ee52a54c..f0abe2c44 100644 --- a/packages/eval/src/index.ts +++ b/packages/eval/src/index.ts @@ -9,10 +9,9 @@ * import { defineAssertion } from '@agentv/eval'; * * export default defineAssertion(({ output, criteria }) => { - * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; * return { - * pass: text.includes('hello'), - * assertions: [{ text: 'Checks greeting', passed: text.includes('hello') }], + * pass: output.includes('hello'), + * assertions: [{ text: 'Checks greeting', passed: output.includes('hello') }], * }; * })); * ``` @@ -23,7 +22,6 @@ * import { defineCodeGrader } from '@agentv/eval'; * * export default defineCodeGrader(({ trace, output }) => { - * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; * return { * score: trace?.eventCount <= 5 ? 1.0 : 0.5, * assertions: [{ text: 'Efficient tool usage', passed: trace?.eventCount <= 5 }], @@ -43,7 +41,25 @@ export { NORMALIZED_TRACE_EVENT_TYPES, NORMALIZED_TRACE_SOURCE_KINDS, NORMALIZED_TRAJECTORY_SCHEMA_VERSION, + TRACE_REDACTION_LEVELS, + TRACE_SCHEMA_VERSION, + TRACE_SOURCE_KINDS, + TRACE_EVENT_TYPES, + TRACE_TOOL_STATUSES, TraceSummarySchema, + TraceSchema, + TraceArtifactSchema, + TraceRawEvidenceSchema, + TraceRedactionStateSchema, + TraceBranchSchema, + TraceErrorSchema, + TraceEventSchema, + TraceMessageSchema, + TraceModelSchema, + TraceSessionSchema, + TraceSourceRefSchema, + TraceSourceSchema, + TraceToolSchema, NormalizedRawEvidenceSchema, NormalizedRedactionStateSchema, NormalizedTraceBranchSchema, @@ -66,6 +82,18 @@ export { ContentSchema, type CodeGraderInput, type CodeGraderResult, + type TraceArtifact, + type TraceRawEvidence, + type TraceRedactionState, + type TraceBranch, + type TraceError, + type TraceEvent, + type TraceMessage, + type TraceModel, + type TraceSession, + type TraceSource, + type TraceSourceRef, + type TraceTool, type NormalizedRawEvidence, type NormalizedRedactionState, type NormalizedTraceBranch, @@ -79,6 +107,7 @@ export { type NormalizedTraceTool, type NormalizedTrajectory, type TraceSummary, + type Trace, type Message, type ToolCall, type TokenUsage, diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index ea454ba67..3200cdc38 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -35,9 +35,8 @@ export const TokenUsageSchema = z.object({ /** * Derived trace summary schema (camelCase for TypeScript ergonomics). * - * This is a compatibility/read model for existing code graders and result - * artifacts. Full trace state should use NormalizedTrajectory and project into - * this shape only at result or grader-compatibility boundaries. + * This is a compact read model for metric-style graders. Full transcript/tool + * evidence lives in the canonical `Trace` under `messages` and `events`. */ export const TraceSummarySchema = z.object({ eventCount: z.number(), @@ -64,12 +63,20 @@ export const NORMALIZED_TRACE_EVENT_TYPES = [ 'model_turn', 'tool_call', 'tool_result', + 'final_response', + 'error', ] as const; export const NORMALIZED_TOOL_STATUSES = ['ok', 'error', 'timeout', 'cancelled', 'unknown'] as const; export const NORMALIZED_REDACTION_LEVELS = ['none', 'partial', 'full'] as const; +export const TRACE_SCHEMA_VERSION = NORMALIZED_TRAJECTORY_SCHEMA_VERSION; +export const TRACE_SOURCE_KINDS = NORMALIZED_TRACE_SOURCE_KINDS; +export const TRACE_EVENT_TYPES = NORMALIZED_TRACE_EVENT_TYPES; +export const TRACE_TOOL_STATUSES = NORMALIZED_TOOL_STATUSES; +export const TRACE_REDACTION_LEVELS = NORMALIZED_REDACTION_LEVELS; + const MetadataSchema = z.record(z.string(), z.unknown()); export const NormalizedRedactionStateSchema = z.object({ @@ -173,6 +180,7 @@ export const NormalizedTraceEventSchema = z.object({ message: NormalizedTraceMessageSchema.optional(), model: NormalizedTraceModelSchema.optional(), tool: NormalizedTraceToolSchema.optional(), + error: NormalizedTraceErrorSchema.optional(), sourceRef: NormalizedTraceSourceRefSchema.optional(), rawEvidence: z.array(NormalizedRawEvidenceSchema).optional(), redaction: NormalizedRedactionStateSchema.optional(), @@ -200,6 +208,19 @@ export const NormalizedTrajectorySchema = z.object({ metadata: MetadataSchema.optional(), }); +export const TraceRedactionStateSchema = NormalizedRedactionStateSchema; +export const TraceErrorSchema = NormalizedTraceErrorSchema; +export const TraceSourceSchema = NormalizedTraceSourceSchema; +export const TraceSessionSchema = NormalizedTraceSessionSchema; +export const TraceBranchSchema = NormalizedTraceBranchSchema; +export const TraceSourceRefSchema = NormalizedTraceSourceRefSchema; +export const TraceRawEvidenceSchema = NormalizedRawEvidenceSchema; +export const TraceMessageSchema = NormalizedTraceMessageSchema; +export const TraceModelSchema = NormalizedTraceModelSchema; +export const TraceToolSchema = NormalizedTraceToolSchema; +export const TraceEventSchema = NormalizedTraceEventSchema; +export const TraceArtifactSchema = NormalizedTrajectorySchema; + /** * Tool call schema. */ @@ -269,22 +290,45 @@ export const MessageSchema = z.object({ metadata: z.record(z.unknown()).optional(), }); +/** + * Canonical evaluation trace exposed to custom graders. + * + * Top-level summary fields (`eventCount`, `toolCalls`, `errorCount`) remain + * available for existing metric graders; full transcript/tool evidence is under + * `messages` and structured execution events under `events`. + */ +export const TraceSchema = TraceSummarySchema.extend({ + schemaVersion: z.literal(TRACE_SCHEMA_VERSION), + messages: z.array(MessageSchema), + events: z.array(TraceEventSchema), + tokenUsage: TokenUsageSchema.optional(), + costUsd: z.number().optional(), + durationMs: z.number().optional(), + startTime: z.string().optional(), + endTime: z.string().optional(), + metadata: MetadataSchema.optional(), +}); + /** * Code grader input schema (camelCase, converted from snake_case wire format). * - * Structured fields (`input`, `output`, `expectedOutput`) are always `Message[]`. - * To extract plain text from message content, use `getTextContent()` from `@agentv/core`. + * `output` is the final answer/scored result only. Transcript-aware graders + * should inspect `messages`, `trace.messages`, or `trace.events`. */ export const CodeGraderInputSchema = z.object({ criteria: z.string(), expectedOutput: z.array(MessageSchema), - output: z.array(MessageSchema).nullable().optional(), + output: z.string().nullable().optional(), + /** Deprecated migration alias; same value as output for text agents. */ + answer: z.string().optional(), + messages: z.array(MessageSchema).optional().default([]), /** Path to a temp file containing the output JSON (used for large payloads). */ outputPath: z.string().optional(), inputFiles: z.array(z.string()), input: z.array(MessageSchema), metadata: z.record(z.unknown()).nullable().optional(), - trace: TraceSummarySchema.nullable().optional(), + trace: TraceSchema.nullable().optional(), + traceSummary: TraceSummarySchema.nullable().optional(), tokenUsage: TokenUsageSchema.nullable().optional(), costUsd: z.number().nullable().optional(), durationMs: z.number().nullable().optional(), @@ -321,18 +365,43 @@ export type CodeGraderInput = z.infer; export type CodeGraderResult = z.infer; export type TraceSummary = z.infer; -export type NormalizedTrajectory = z.infer; -export type NormalizedTraceSource = z.infer; -export type NormalizedTraceSession = z.infer; -export type NormalizedTraceBranch = z.infer; -export type NormalizedTraceEvent = z.infer; -export type NormalizedTraceMessage = z.infer; -export type NormalizedTraceModel = z.infer; -export type NormalizedTraceTool = z.infer; -export type NormalizedTraceError = z.infer; -export type NormalizedTraceSourceRef = z.infer; -export type NormalizedRawEvidence = z.infer; -export type NormalizedRedactionState = z.infer; +export type Trace = z.infer; +export type TraceArtifact = z.infer; +export type TraceSource = z.infer; +export type TraceSession = z.infer; +export type TraceBranch = z.infer; +export type TraceEvent = z.infer; +export type TraceMessage = z.infer; +export type TraceModel = z.infer; +export type TraceTool = z.infer; +export type TraceError = z.infer; +export type TraceSourceRef = z.infer; +export type TraceRawEvidence = z.infer; +export type TraceRedactionState = z.infer; +/** @deprecated Use TraceArtifact for legacy import/replay artifacts or Trace for evaluation results. */ +export type NormalizedTrajectory = TraceArtifact; +/** @deprecated Use TraceSource. */ +export type NormalizedTraceSource = TraceSource; +/** @deprecated Use TraceSession. */ +export type NormalizedTraceSession = TraceSession; +/** @deprecated Use TraceBranch. */ +export type NormalizedTraceBranch = TraceBranch; +/** @deprecated Use TraceEvent. */ +export type NormalizedTraceEvent = TraceEvent; +/** @deprecated Use TraceMessage. */ +export type NormalizedTraceMessage = TraceMessage; +/** @deprecated Use TraceModel. */ +export type NormalizedTraceModel = TraceModel; +/** @deprecated Use TraceTool. */ +export type NormalizedTraceTool = TraceTool; +/** @deprecated Use TraceError. */ +export type NormalizedTraceError = TraceError; +/** @deprecated Use TraceSourceRef. */ +export type NormalizedTraceSourceRef = TraceSourceRef; +/** @deprecated Use TraceRawEvidence. */ +export type NormalizedRawEvidence = TraceRawEvidence; +/** @deprecated Use TraceRedactionState. */ +export type NormalizedRedactionState = TraceRedactionState; export type Message = z.infer; export type ToolCall = z.infer; export type TokenUsage = z.infer; From 31e25b0ad6d8f3e5b79b3d5680d00b3283ba1d68 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 12 Jun 2026 07:25:03 +0200 Subject: [PATCH 2/7] feat(evaluation): score final output with full trace --- apps/cli/src/commands/eval/commands/assert.ts | 17 ++- apps/cli/src/commands/eval/run-eval.ts | 50 +++---- apps/cli/src/commands/inspect/score.ts | 23 ++- apps/cli/src/commands/inspect/show.ts | 4 +- apps/cli/src/commands/inspect/utils.ts | 3 +- apps/cli/src/commands/pipeline/grade.ts | 14 +- apps/cli/src/commands/results/show.ts | 4 +- .../src/evaluation/graders/code-grader.ts | 34 ++++- packages/core/src/evaluation/graders/types.ts | 6 +- packages/core/src/evaluation/orchestrator.ts | 131 +++++++++++++++--- packages/core/src/evaluation/types.ts | 10 +- .../core/src/observability/otel-exporter.ts | 20 ++- 12 files changed, 232 insertions(+), 84 deletions(-) diff --git a/apps/cli/src/commands/eval/commands/assert.ts b/apps/cli/src/commands/eval/commands/assert.ts index 519fbfc84..c50e20e5c 100644 --- a/apps/cli/src/commands/eval/commands/assert.ts +++ b/apps/cli/src/commands/eval/commands/assert.ts @@ -3,7 +3,7 @@ import path from 'node:path'; import { command, option, optional, positional, string } from 'cmd-ts'; import fg from 'fast-glob'; -import { executeScript } from '@agentv/core'; +import { buildTraceFromMessages, executeScript } from '@agentv/core'; export const evalAssertCommand = command({ name: 'assert', @@ -64,17 +64,26 @@ export const evalAssertCommand = command({ // Build payload matching CodeGrader's expected format (snake_case). // Include all fields that defineCodeGrader validates as required. + const messages = [{ role: 'assistant' as const, content: resolvedOutput }]; + const inputMessages = [{ role: 'user' as const, content: resolvedInput }]; + const trace = buildTraceFromMessages({ + input: inputMessages, + output: messages, + finalOutput: resolvedOutput, + }); const payload = JSON.stringify( { - output: [{ role: 'assistant', content: resolvedOutput }], - input: [{ role: 'user', content: resolvedInput }], + output: resolvedOutput, + answer: resolvedOutput, + messages, + input: inputMessages, question: resolvedInput, criteria: '', expected_output: [], reference_answer: '', input_files: [], - trace: null, + trace, token_usage: null, cost_usd: null, duration_ms: null, diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index a1d31c6bb..623045d7c 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -15,6 +15,7 @@ import { ResponseCache, RunBudgetTracker, type TrialsConfig, + buildTraceFromMessages, runEvaluation as defaultRunEvaluation, deriveCategory, ensureVSCodeSubagents, @@ -313,32 +314,15 @@ function normalizeOutputMessages(cliValue: string | undefined): number | 'all' { } /** - * Trim output messages for results JSONL. - * Each message is stripped to { role, content } only. - * - * - `1` (default): last assistant message only (legacy behavior) - * - `N`: last N messages (any role) - * - `'all'`: all messages + * Deprecated compatibility hook for the old output-as-messages JSONL surface. + * Result `output` is now the final answer string; full transcript data stays + * under `trace.messages` and is intentionally not trimmed here. */ export function trimOutputMessages( output: EvaluationResult['output'], - outputMessages: number | 'all', + _outputMessages: number | 'all', ): EvaluationResult['output'] { - const messages = output ?? []; - - if (outputMessages === 'all') { - return messages.map((m) => ({ role: m.role, content: m.content })); - } - - if (outputMessages === 1) { - // Legacy behavior: last assistant message only - const lastAssistant = messages.filter((m) => m.role === 'assistant').at(-1); - return lastAssistant ? [{ role: lastAssistant.role, content: lastAssistant.content }] : []; - } - - // Last N messages (any role), trimmed to { role, content } - const sliced = messages.slice(-outputMessages); - return sliced.map((m) => ({ role: m.role, content: m.content })); + return output; } function normalizeOptions( @@ -1576,7 +1560,16 @@ export async function runEvalCommand( testId: testCase.id, score: 0, assertions: [], - output: [], + output: budgetMsg, + trace: buildTraceFromMessages({ + input: testCase.input as EvaluationResult['input'], + output: [{ role: 'assistant' as const, content: budgetMsg }], + finalOutput: budgetMsg, + target: selection.targetName, + testId: testCase.id, + conversationId: testCase.conversation_id, + error: budgetMsg, + }), error: budgetMsg, budgetExceeded: true, executionStatus: 'execution_error' as const, @@ -1680,7 +1673,16 @@ export async function runEvalCommand( testId: testCase.id, score: 0, assertions: [], - output: [], + output: message, + trace: buildTraceFromMessages({ + input: testCase.input as EvaluationResult['input'], + output: [{ role: 'assistant' as const, content: message }], + finalOutput: message, + target: selection.targetName, + testId: testCase.id, + conversationId: testCase.conversation_id, + error: message, + }), scores: [], error: message, executionStatus: 'execution_error' as const, diff --git a/apps/cli/src/commands/inspect/score.ts b/apps/cli/src/commands/inspect/score.ts index 3abdc9ca8..75244e827 100644 --- a/apps/cli/src/commands/inspect/score.ts +++ b/apps/cli/src/commands/inspect/score.ts @@ -9,6 +9,7 @@ import { type Provider, type ProviderRequest, type ProviderResponse, + buildTraceFromMessages, createBuiltinRegistry, toCamelCaseDeep, } from '@agentv/core'; @@ -205,9 +206,25 @@ async function runScore( for (const raw of results) { if (testIdFilter && raw.test_id !== testIdFilter) continue; - const trace = toTraceSummary(raw); const candidate = extractCandidate(raw); - const output = raw.output as readonly Message[] | undefined; + const output = + (raw.trace as { messages?: unknown } | undefined)?.messages ?? + (Array.isArray(raw.output) ? raw.output : undefined); + const outputMessages = Array.isArray(output) + ? (toCamelCaseDeep(output) as readonly Message[]) + : undefined; + const trace = + raw.trace && + Array.isArray((raw.trace as { messages?: unknown }).messages) && + Array.isArray((raw.trace as { events?: unknown }).events) + ? (toCamelCaseDeep(raw.trace) as EvaluationContext['trace']) + : buildTraceFromMessages({ + output: outputMessages, + finalOutput: candidate, + summary: toTraceSummary(raw), + target: raw.target, + testId: raw.test_id, + }); const evalContext: EvaluationContext = { evalCase: buildTestCase(raw), @@ -217,7 +234,7 @@ async function runScore( attempt: 1, promptInputs: { question: '' }, now: new Date(), - output: Array.isArray(output) ? output : undefined, + output: outputMessages, trace, tokenUsage: raw.token_usage ? (toCamelCaseDeep(raw.token_usage) as EvaluationContext['tokenUsage']) diff --git a/apps/cli/src/commands/inspect/show.ts b/apps/cli/src/commands/inspect/show.ts index c738a4aad..3925ee172 100644 --- a/apps/cli/src/commands/inspect/show.ts +++ b/apps/cli/src/commands/inspect/show.ts @@ -82,7 +82,9 @@ interface RawToolCall { * Shows a hierarchical trace: LLM calls → tool calls. */ function renderTree(result: RawResult): string { - const messages = result.output as RawMessage[] | undefined; + const messages = + (result.trace as { messages?: RawMessage[] } | undefined)?.messages ?? + (Array.isArray(result.output) ? (result.output as RawMessage[]) : undefined); const spans = getTraceSpans(result); if (!messages || messages.length === 0) { diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index 1b9cbd30b..25399ed20 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -1,7 +1,7 @@ import { readFileSync, readdirSync, statSync } from 'node:fs'; import path from 'node:path'; import type { EvaluationResult, TraceSummary } from '@agentv/core'; -import { DEFAULT_THRESHOLD, toCamelCaseDeep } from '@agentv/core'; +import { DEFAULT_THRESHOLD, toCamelCaseDeep, toSnakeCaseDeep } from '@agentv/core'; import { RESULT_INDEX_FILENAME, RESULT_RUNS_DIRNAME, @@ -183,6 +183,7 @@ function toRawResult(result: EvaluationResult): RawResult { end_time: result.endTime, input: result.input, output: result.output, + trace: toSnakeCaseDeep(result.trace) as RawTraceSummary, file_changes: result.fileChanges, }; } diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index d262c0aa0..906987245 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -17,6 +17,7 @@ import { join } from 'node:path'; import { type AssertionResult, + buildTraceFromMessages, executeScript, runContainsAllAssertion, runContainsAnyAssertion, @@ -107,13 +108,22 @@ export async function runCodeGraders( const { testId, resultsDir, responseText, inputData } = task; const graderName = graderConfig.name as string; const inputText = extractInputText(inputData.input); + const messages = [{ role: 'assistant' as const, content: responseText }]; + const trace = buildTraceFromMessages({ + input: inputData.input, + output: messages, + finalOutput: responseText, + testId, + }); const payload = JSON.stringify({ - output: [{ role: 'assistant', content: responseText }], + output: responseText, + answer: responseText, + messages, input: inputData.input, criteria: '', expected_output: [], input_files: inputData.input_files ?? [], - trace: null, + trace, token_usage: null, cost_usd: null, duration_ms: null, diff --git a/apps/cli/src/commands/results/show.ts b/apps/cli/src/commands/results/show.ts index 52b8b9cfa..8c1926d8c 100644 --- a/apps/cli/src/commands/results/show.ts +++ b/apps/cli/src/commands/results/show.ts @@ -35,9 +35,7 @@ function formatInput(result: EvaluationResult): string { function formatOutput(result: EvaluationResult): string { if (!result.output || result.output.length === 0) return '(no output)'; - return result.output - .map((msg) => String((msg as unknown as Record).content ?? '')) - .join('\n'); + return result.output; } // ── Formatting ─────────────────────────────────────────────────────────── diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index 3ec89061b..e58c1b438 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -134,17 +134,19 @@ export class CodeGrader implements Grader { return imageTmpDir; }; - // Materialize multimodal content (data URIs → temp files, source → path) - const materializedOutput = await materializeContentForGrader( - context.output as readonly Record[] | undefined, + const transcriptMessages = context.trace?.messages ?? context.output ?? []; + + // Materialize transcript multimodal content (data URIs → temp files, source → path) + const materializedMessages = await materializeContentForGrader( + transcriptMessages as unknown as readonly Record[] | undefined, getImageDir, ); - // Determine whether to use file-backed output for large payloads - let outputForPayload: readonly Record[] | null = materializedOutput; + // Determine whether to use file-backed output for large final answers + let outputForPayload: string | null = context.candidate; let outputPath: string | undefined; - if (outputForPayload) { + if (outputForPayload !== null) { const serialized = JSON.stringify(outputForPayload); if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) { const tmpDir = await mkdtemp(join(tmpdir(), 'agentv-grader-')); @@ -154,6 +156,13 @@ export class CodeGrader implements Grader { } } + const traceForPayload = context.trace + ? { + ...context.trace, + messages: materializedMessages ?? context.trace.messages, + } + : null; + // Build payload (camelCase internally, converted to snake_case for graders) const payload = { criteria: context.evalCase.criteria, @@ -162,6 +171,8 @@ export class CodeGrader implements Grader { getImageDir, ), output: outputForPayload, + answer: context.candidate, + messages: materializedMessages ?? [], outputPath, inputFiles: context.evalCase.file_paths, input: await materializeContentForGrader( @@ -169,7 +180,16 @@ export class CodeGrader implements Grader { getImageDir, ), metadata: context.evalCase.metadata ?? null, - trace: context.trace ?? null, + trace: traceForPayload, + traceSummary: context.trace + ? { + eventCount: context.trace.eventCount, + toolCalls: context.trace.toolCalls, + errorCount: context.trace.errorCount, + toolDurations: context.trace.toolDurations, + llmCallCount: context.trace.llmCallCount, + } + : null, tokenUsage: context.tokenUsage ?? null, costUsd: context.costUsd ?? null, durationMs: context.durationMs ?? null, diff --git a/packages/core/src/evaluation/graders/types.ts b/packages/core/src/evaluation/graders/types.ts index 1d548e5f9..c1376e5e8 100644 --- a/packages/core/src/evaluation/graders/types.ts +++ b/packages/core/src/evaluation/graders/types.ts @@ -1,6 +1,6 @@ import type { ResolvedTarget } from '../providers/targets.js'; import type { ChatPrompt, Message, Provider } from '../providers/types.js'; -import type { TokenUsage, TraceSummary } from '../trace.js'; +import type { TokenUsage, Trace } from '../trace.js'; import type { DependencyResult, DockerWorkspaceConfig, @@ -37,8 +37,8 @@ export interface EvaluationContext { readonly evaluator?: GraderConfig; /** Output messages from agent execution (primary source for tool trajectory) */ readonly output?: readonly Message[]; - /** Lightweight summary of trace events (if available) */ - readonly trace?: TraceSummary; + /** Canonical execution trace with messages, events, metrics, and provenance. */ + readonly trace?: Trace; /** Token usage from provider execution (promoted from TraceSummary) */ readonly tokenUsage?: TokenUsage; /** Total cost in USD (from provider) */ diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index e3da97d6e..54e38fcd9 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -46,7 +46,10 @@ import { createBuiltinRegistry, discoverAssertions, discoverGraders } from './re import type { RunBudgetTracker } from './run-budget-tracker.js'; import { type TokenUsage, + type Trace, type TraceSummary, + appendErrorEventToTrace, + buildTraceFromMessages, computeTraceSummary, mergeExecutionMetrics, } from './trace.js'; @@ -1130,10 +1133,9 @@ export async function runEvaluation( // Helper: build a DependencyResult from a completed EvaluationResult function toDependencyResult(r: EvaluationResult): DependencyResult { - const outputText = extractLastAssistantContent(r.output); return { score: r.score, - output: outputText, + output: r.output, workspace_path: r.workspacePath, details: r.scores ? (Object.fromEntries( @@ -1196,6 +1198,7 @@ export async function runEvaluation( // eval files/targets in the current CLI invocation, so queued cases stop once // cumulative spend reaches the cap while already-running cases are allowed to finish. if (runBudgetTracker?.isExceeded()) { + const errorMessage = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`; const budgetResult: EvaluationResult = { timestamp: (now ?? (() => new Date()))().toISOString(), testId: evalCase.id, @@ -1203,15 +1206,24 @@ export async function runEvaluation( category: evalCase.category, score: 0, assertions: [], - output: [], + output: errorMessage, + trace: buildTraceFromMessages({ + input: evalCase.input as readonly Message[], + output: [{ role: 'assistant' as const, content: errorMessage }], + finalOutput: errorMessage, + target: target.name, + testId: evalCase.id, + conversationId: evalCase.conversation_id, + error: errorMessage, + }), target: target.name, - error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`, + error: errorMessage, budgetExceeded: true, executionStatus: 'execution_error', failureStage: 'setup', failureReasonCode: 'budget_exceeded', executionError: { - message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`, + message: errorMessage, stage: 'setup', }, }; @@ -1235,6 +1247,7 @@ export async function runEvaluation( // Check suite-level budget before dispatching if (budgetUsd !== undefined && budgetExhausted) { + const errorMessage = `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`; const budgetResult: EvaluationResult = { timestamp: (now ?? (() => new Date()))().toISOString(), testId: evalCase.id, @@ -1242,15 +1255,24 @@ export async function runEvaluation( category: evalCase.category, score: 0, assertions: [], - output: [], + output: errorMessage, + trace: buildTraceFromMessages({ + input: evalCase.input as readonly Message[], + output: [{ role: 'assistant' as const, content: errorMessage }], + finalOutput: errorMessage, + target: target.name, + testId: evalCase.id, + conversationId: evalCase.conversation_id, + error: errorMessage, + }), target: target.name, - error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`, + error: errorMessage, budgetExceeded: true, executionStatus: 'execution_error', failureStage: 'setup', failureReasonCode: 'budget_exceeded', executionError: { - message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`, + message: errorMessage, stage: 'setup', }, }; @@ -1282,7 +1304,16 @@ export async function runEvaluation( category: evalCase.category, score: 0, assertions: [], - output: [], + output: errorMsg, + trace: buildTraceFromMessages({ + input: evalCase.input as readonly Message[], + output: [{ role: 'assistant' as const, content: errorMsg }], + finalOutput: errorMsg, + target: target.name, + testId: evalCase.id, + conversationId: evalCase.conversation_id, + error: errorMsg, + }), target: target.name, error: errorMsg, executionStatus: 'execution_error', @@ -1455,7 +1486,16 @@ export async function runEvaluation( category: evalCase.category, score: 0, assertions: [], - output: [], + output: errorMsg, + trace: buildTraceFromMessages({ + input: evalCase.input as readonly Message[], + output: [{ role: 'assistant' as const, content: errorMsg }], + finalOutput: errorMsg, + target: target.name, + testId: evalCase.id, + conversationId: evalCase.conversation_id, + error: errorMsg, + }), target: target.name, error: errorMsg, executionStatus: 'execution_error', @@ -1756,6 +1796,10 @@ async function runBatchEvaluation(options: { if (providerError) { result = { ...result, + trace: appendErrorEventToTrace(result.trace, providerError, { + failure_stage: 'agent', + failure_reason_code: 'provider_error', + }), error: providerError, executionStatus: 'execution_error' as const, failureStage: 'agent' as const, @@ -2495,6 +2539,10 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise [...s.assertions]); - const totalDurationMs = Date.now() - caseStartMs; return { timestamp: nowFn().toISOString(), @@ -3531,7 +3610,8 @@ async function runConversationMode(options: { score: finalScore, assertions: flatAssertions, target: target.name, - output: outputMessages, + output: finalOutput, + trace, scores: allResultScores, executionStatus: classifyQualityStatus(finalScore, threshold ?? DEFAULT_THRESHOLD), input: evalCase.input.map((m) => ({ @@ -3732,6 +3812,16 @@ function buildErrorResult( } : undefined; const input = buildResultInput(promptInputs); + const output = `Error occurred: ${message}`; + const trace = buildTraceFromMessages({ + input, + output: [{ role: 'assistant' as const, content: output }], + finalOutput: output, + target: targetName, + testId: evalCase.id, + conversationId: evalCase.conversation_id, + error: message, + }); return { timestamp: timestamp.toISOString(), @@ -3744,7 +3834,8 @@ function buildErrorResult( target: targetName, requests, input, - output: [{ role: 'assistant' as const, content: `Error occurred: ${message}` }], + output, + trace, error: message, executionStatus: 'execution_error', failureStage, diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 7e764ff8b..cb3735296 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -1,4 +1,4 @@ -import type { TokenUsage, ToolTrajectoryGraderConfig, TraceSummary } from './trace.js'; +import type { TokenUsage, ToolTrajectoryGraderConfig, Trace } from './trace.js'; /** A single assertion verdict with optional evidence. */ export interface AssertionEntry { @@ -1194,14 +1194,14 @@ export interface EvaluationResult { }; readonly scores?: readonly GraderResult[]; readonly error?: string; - /** Lightweight summary of the execution trace (always included when available) */ - readonly trace?: TraceSummary; + /** Canonical execution trace: messages, events, metrics, and provider provenance. */ + readonly trace: Trace; /** Path to the temporary workspace directory (included on failure for debugging) */ readonly workspacePath?: string; /** Input messages sent to the agent. Always Message[] for consistent shape with output. */ readonly input?: readonly import('./providers/types.js').Message[]; - /** Output messages from agent execution. Always present — at minimum contains the final assistant message. */ - readonly output: readonly import('./providers/types.js').Message[]; + /** Final answer / scored result only. Full transcript lives in trace.messages/events. */ + readonly output: string; /** Captured output from workspace before_all script */ readonly beforeAllOutput?: string; /** Captured output from workspace before_each script */ diff --git a/packages/core/src/observability/otel-exporter.ts b/packages/core/src/observability/otel-exporter.ts index 73f1a98b1..d3c400a9a 100644 --- a/packages/core/src/observability/otel-exporter.ts +++ b/packages/core/src/observability/otel-exporter.ts @@ -185,10 +185,7 @@ export class OtelTraceExporter { if (result.suite) rootSpan.setAttribute('agentv.suite', result.suite); rootSpan.setAttribute('agentv.score', result.score); if (captureContent && result.output.length > 0) { - const lastMsg = result.output[result.output.length - 1]; - const text = - typeof lastMsg.content === 'string' ? lastMsg.content : JSON.stringify(lastMsg.content); - rootSpan.setAttribute('agentv.output_text', text); + rootSpan.setAttribute('agentv.output_text', result.output); } // Flat execution metrics @@ -219,12 +216,13 @@ export class OtelTraceExporter { rootSpan.setAttribute('agentv.trace.llm_call_count', t.llmCallCount); } - // Child spans from output messages (--trace mode) - if (result.output) { + // Child spans from trace messages (--trace mode) + const traceMessages = result.trace.messages; + if (traceMessages.length > 0) { const parentCtx = api.trace.setSpan(api.context.active(), rootSpan); if (this.options.groupTurns) { - const turns = groupMessagesIntoTurns(result.output); + const turns = groupMessagesIntoTurns(traceMessages); if (turns.length > 1) { for (const [i, turn] of turns.entries()) { api.context.with(parentCtx, () => { @@ -244,12 +242,12 @@ export class OtelTraceExporter { }); } } else { - for (const msg of result.output) { + for (const msg of traceMessages) { this.exportMessage(tracer, api, parentCtx, msg, captureContent); } } } else { - for (const msg of result.output) { + for (const msg of traceMessages) { this.exportMessage(tracer, api, parentCtx, msg, captureContent); } } @@ -593,13 +591,13 @@ export class OtelStreamingObserver { } const model = - result.output.find((msg) => msg.role === 'assistant')?.metadata?.model ?? + result.trace.messages.find((msg) => msg.role === 'assistant')?.metadata?.model ?? result.target ?? 'unknown'; this.onLlmCall(String(model), result.tokenUsage); - for (const message of result.output) { + for (const message of result.trace.messages) { for (const toolCall of message.toolCalls ?? []) { this.onToolCall( toolCall.tool, From 082298261d571313433731492ee5d8411e11a5a2 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 12 Jun 2026 07:25:25 +0200 Subject: [PATCH 3/7] feat(cli): write answer and transcript artifacts --- apps/cli/src/commands/eval/artifact-writer.ts | 156 +++++++++++------- apps/cli/src/commands/results/manifest.ts | 43 +++-- packages/core/src/import/types.ts | 2 +- 3 files changed, 134 insertions(+), 67 deletions(-) diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 8760a3728..8fcb8ed71 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -6,8 +6,12 @@ import { type EvalTest, type EvaluationResult, type GraderResult, + type Message, type TargetDefinition, - toTranscriptJsonLines, + type TraceSummary, + buildTraceFromMessages, + extractLastAssistantContent, + traceToTranscriptJsonLines, } from '@agentv/core'; import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; import { RESULT_INDEX_FILENAME } from './result-layout.js'; @@ -195,7 +199,10 @@ export interface IndexArtifactEntry { readonly grading_path: string; readonly timing_path: string; readonly output_path?: string; + readonly answer_path?: string; + readonly transcript_path?: string; readonly input_path?: string; + /** @deprecated Use output_path/answer_path for the final answer. */ readonly response_path?: string; readonly task_dir?: string; readonly eval_path?: string; @@ -245,23 +252,8 @@ function countToolCalls(result: EvaluationResult): { toolCalls: Record; total: number; } { - const toolCalls: Record = {}; - let total = 0; - - const trace = result.trace as - | { steps?: readonly { toolName?: string; type?: string }[] } - | undefined; - - if (trace?.steps) { - for (const step of trace.steps) { - if (step.toolName || step.type === 'tool') { - const name = step.toolName ?? 'unknown'; - toolCalls[name] = (toolCalls[name] ?? 0) + 1; - total += 1; - } - } - } - + const toolCalls = { ...(result.trace?.toolCalls ?? {}) }; + const total = Object.values(toolCalls).reduce((sum, count) => sum + count, 0); return { toolCalls, total }; } @@ -365,9 +357,8 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact workspace_changes: parseWorkspaceChanges(result.fileChanges), conversation: result.conversationId ? { - turns: result.trace - ? ((result.trace as { steps?: readonly unknown[] }).steps?.length ?? 0) - : 0, + turns: + result.trace?.messages.filter((message) => message.role === 'assistant').length ?? 0, conversation_id: result.conversationId, } : undefined, @@ -661,7 +652,10 @@ export function buildIndexArtifactEntry( gradingPath: string; timingPath: string; outputPath?: string; + answerPath?: string; + transcriptPath?: string; inputPath?: string; + responsePath?: string; taskBundle?: MaterializedTaskBundlePaths; }, ): IndexArtifactEntry { @@ -689,9 +683,18 @@ export function buildIndexArtifactEntry( output_path: options.outputPath ? toRelativeArtifactPath(options.outputDir, options.outputPath) : undefined, + answer_path: options.answerPath + ? toRelativeArtifactPath(options.outputDir, options.answerPath) + : undefined, + transcript_path: options.transcriptPath + ? toRelativeArtifactPath(options.outputDir, options.transcriptPath) + : undefined, input_path: options.inputPath ? toRelativeArtifactPath(options.outputDir, options.inputPath) : undefined, + response_path: options.responsePath + ? toRelativeArtifactPath(options.outputDir, options.responsePath) + : undefined, ...buildTaskBundleIndexFields(options.outputDir, options.taskBundle), metadata: result.metadata, }; @@ -703,7 +706,8 @@ export function buildResultIndexArtifact( ): ResultIndexArtifact { const artifactSubdir = buildArtifactSubdir(result); const input = extractInput(result); - const hasResponse = Array.isArray(result.output) && result.output.length > 0; + const hasAnswer = result.output.length > 0; + const hasTranscript = result.trace.messages.length > 0 || result.trace.events.length > 0; return { timestamp: result.timestamp, @@ -725,10 +729,12 @@ export function buildResultIndexArtifact( grading_path: path.posix.join(artifactSubdir, 'grading.json'), timing_path: path.posix.join(artifactSubdir, 'timing.json'), input_path: input ? path.posix.join(artifactSubdir, 'input.md') : undefined, - output_path: hasResponse - ? path.posix.join(artifactSubdir, 'outputs', 'response.md') + output_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined, + answer_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined, + transcript_path: hasTranscript + ? path.posix.join(artifactSubdir, 'outputs', 'transcript.jsonl') : undefined, - response_path: hasResponse + response_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'response.md') : undefined, ...(taskBundle @@ -756,6 +762,16 @@ async function writeJsonlFile(filePath: string, records: readonly unknown[]): Pr await writeFile(filePath, content, 'utf8'); } +async function writeTranscriptJsonl(filePath: string, result: EvaluationResult): Promise { + const lines = traceToTranscriptJsonLines(result.trace, { + testId: result.testId, + target: result.target, + }); + const content = + lines.length > 0 ? `${lines.map((line) => JSON.stringify(line)).join('\n')}\n` : ''; + await writeFile(filePath, content, 'utf8'); +} + function isRecord(value: unknown): value is Record { return typeof value === 'object' && value !== null && !Array.isArray(value); } @@ -852,6 +868,7 @@ type ParsedEvaluationResult = Record & { assertions: EvaluationResult['assertions']; target: string; output: EvaluationResult['output']; + trace: EvaluationResult['trace']; executionStatus: EvaluationResult['executionStatus']; }; @@ -874,7 +891,7 @@ function isAssertionEntry(value: unknown): value is EvaluationResult['assertions ); } -function isOutputMessage(value: unknown): value is EvaluationResult['output'][number] { +function isOutputMessage(value: unknown): value is Message { if (!value || typeof value !== 'object' || Array.isArray(value)) { return false; } @@ -890,12 +907,47 @@ function isExecutionStatus(value: unknown): value is EvaluationResult['execution ); } +function isTraceRecord(value: unknown): value is EvaluationResult['trace'] { + return ( + !!value && + typeof value === 'object' && + !Array.isArray(value) && + Array.isArray((value as { messages?: unknown }).messages) && + Array.isArray((value as { events?: unknown }).events) + ); +} + function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefined { if (!value || typeof value !== 'object' || Array.isArray(value)) { return undefined; } const result = value as Record; + const legacyOutputMessages = Array.isArray(result.output) + ? result.output.filter(isOutputMessage) + : undefined; + const output = + typeof result.output === 'string' + ? result.output + : extractLastAssistantContent(legacyOutputMessages); + const legacySummary = + result.trace && typeof result.trace === 'object' && !Array.isArray(result.trace) + ? (result.trace as TraceSummary) + : undefined; + const trace = isTraceRecord(result.trace) + ? result.trace + : buildTraceFromMessages({ + input: Array.isArray(result.input) ? (result.input as EvaluationResult['input']) : [], + output: legacyOutputMessages, + summary: legacySummary, + finalOutput: output, + tokenUsage: result.tokenUsage as EvaluationResult['tokenUsage'], + costUsd: typeof result.costUsd === 'number' ? result.costUsd : undefined, + durationMs: typeof result.durationMs === 'number' ? result.durationMs : undefined, + target: typeof result.target === 'string' ? result.target : undefined, + testId: typeof result.testId === 'string' ? result.testId : undefined, + }); + return { ...result, timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(), @@ -903,7 +955,8 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin score: typeof result.score === 'number' ? result.score : 0, assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [], target: typeof result.target === 'string' ? result.target : 'unknown', - output: Array.isArray(result.output) ? result.output.filter(isOutputMessage) : [], + output, + trace, executionStatus: isExecutionStatus(result.executionStatus) ? result.executionStatus : 'ok', }; } @@ -959,23 +1012,10 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri const lines: string[] = []; for (const result of results) { - const transcriptLines = toTranscriptJsonLines( - { - messages: [...(result.input ?? []), ...result.output], - source: { - provider: result.target, - sessionId: result.conversationId ?? result.testId, - startedAt: result.timestamp, - }, - tokenUsage: result.tokenUsage, - durationMs: result.durationMs, - costUsd: result.costUsd, - }, - { - testId: result.testId, - target: result.target, - }, - ); + const transcriptLines = traceToTranscriptJsonLines(result.trace, { + testId: result.testId, + target: result.target, + }); lines.push(...transcriptLines.map((line) => JSON.stringify(line))); } @@ -1085,14 +1125,16 @@ export async function writePerTestArtifacts( if (input) { await writeFile(path.join(testDir, 'input.md'), input, 'utf8'); } - if (result.output && result.output.length > 0) { + if (result.output.length > 0 || result.trace.messages.length > 0) { const outputsDir = path.join(testDir, 'outputs'); await mkdir(outputsDir, { recursive: true }); - await writeFile( - path.join(outputsDir, 'response.md'), - formatOutputMarkdown(result.output), - 'utf8', - ); + if (result.output.length > 0) { + await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8'); + // Deprecated compatibility alias. New consumers should use answer.md + // for scored output or transcript.jsonl for the full execution record. + await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8'); + } + await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result); } const taskBundle = await materializeTaskBundleForResult({ @@ -1156,14 +1198,16 @@ export async function writeArtifactsFromResults( await writeFile(path.join(testDir, 'input.md'), input, 'utf8'); } - if (result.output && result.output.length > 0) { + if (result.output.length > 0 || result.trace.messages.length > 0) { const outputsDir = path.join(testDir, 'outputs'); await mkdir(outputsDir, { recursive: true }); - await writeFile( - path.join(outputsDir, 'response.md'), - formatOutputMarkdown(result.output), - 'utf8', - ); + if (result.output.length > 0) { + await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8'); + // Deprecated compatibility alias. New consumers should use answer.md + // for scored output or transcript.jsonl for the full execution record. + await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8'); + } + await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result); } const taskBundle = await materializeTaskBundleForResult({ diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 1ec215610..99dd71993 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -1,7 +1,12 @@ import { existsSync, readFileSync } from 'node:fs'; import path from 'node:path'; -import type { EvaluationResult } from '@agentv/core'; +import { + type EvaluationResult, + type TranscriptJsonLine, + buildTraceFromMessages, + traceFromTranscriptJsonLines, +} from '@agentv/core'; import type { GradingArtifact, TimingArtifact } from '../eval/artifact-writer.js'; import { @@ -32,6 +37,8 @@ export interface ResultManifestRecord { readonly timing_path?: string; readonly input_path?: string; readonly output_path?: string; + readonly answer_path?: string; + readonly transcript_path?: string; readonly response_path?: string; readonly artifact_dir?: string; readonly task_dir?: string; @@ -106,20 +113,35 @@ function hydrateOutput( baseDir: string, record: ResultManifestRecord, ): EvaluationResult['output'] | undefined { - const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path); + const responseText = readOptionalText( + baseDir, + record.output_path ?? record.answer_path ?? record.response_path, + ); if (!responseText) { return undefined; } - const messages = parseMarkdownMessages(responseText); - if (messages.length > 0) { - return messages.map((message) => ({ - role: message.role as 'assistant' | 'user' | 'system' | 'tool', - content: message.content, - })); + return responseText.trimEnd(); +} + +function hydrateTrace(baseDir: string, record: ResultManifestRecord): EvaluationResult['trace'] { + const transcriptText = readOptionalText(baseDir, record.transcript_path); + if (transcriptText) { + try { + return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText)); + } catch { + // Fall through to a minimal trace below. + } } - return [{ role: 'assistant', content: responseText.trimEnd() }]; + const output = hydrateOutput(baseDir, record) ?? ''; + return buildTraceFromMessages({ + input: hydrateInput(baseDir, record), + output: output ? [{ role: 'assistant', content: output }] : [], + finalOutput: output, + target: record.target, + testId: record.test_id, + }); } function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): EvaluationResult { @@ -176,7 +198,8 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E durationMs: timing?.duration_ms ?? record.duration_ms, costUsd: record.cost_usd, input: hydrateInput(baseDir, record), - output: hydrateOutput(baseDir, record), + output: hydrateOutput(baseDir, record) ?? '', + trace: hydrateTrace(baseDir, record), metadata: record.metadata, } as EvaluationResult; } diff --git a/packages/core/src/import/types.ts b/packages/core/src/import/types.ts index 2a451c24f..baeacb603 100644 --- a/packages/core/src/import/types.ts +++ b/packages/core/src/import/types.ts @@ -18,7 +18,7 @@ import { readFile } from 'node:fs/promises'; import { toCamelCaseDeep, toSnakeCaseDeep } from '../evaluation/case-conversion.js'; import type { Message, ProviderTokenUsage } from '../evaluation/providers/types.js'; -import { buildTraceFromMessages, type Trace } from '../evaluation/trace.js'; +import { type Trace, buildTraceFromMessages } from '../evaluation/trace.js'; /** * A parsed transcript: ordered messages plus session metadata (internal camelCase). From 1fdb9a2378069a44f951c7dcdb2d04fefc0d4d58 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 12 Jun 2026 07:30:42 +0200 Subject: [PATCH 4/7] chore: remove repo-local ntm artifacts --- .ntm/palette.md | 17 ----------------- .ntm/personas.toml | 8 -------- 2 files changed, 25 deletions(-) delete mode 100644 .ntm/palette.md delete mode 100644 .ntm/personas.toml diff --git a/.ntm/palette.md b/.ntm/palette.md deleted file mode 100644 index 8b821ce3e..000000000 --- a/.ntm/palette.md +++ /dev/null @@ -1,17 +0,0 @@ -# Project Commands - -## Project -### build | Build Project -bun run build - -### test | Run Tests -bun run test - -### typecheck | Typecheck Workspaces -bun run typecheck - -### lint | Lint and Format Check -bun run lint - -### validate-evals | Validate Example Eval YAML -bun run validate:examples diff --git a/.ntm/personas.toml b/.ntm/personas.toml deleted file mode 100644 index 47fa361ca..000000000 --- a/.ntm/personas.toml +++ /dev/null @@ -1,8 +0,0 @@ -# Project personas for NTM -# Define specialized agent roles and behaviors here. -# Example: -# [[personas]] -# name = "architect" -# agent = "claude" -# description = "High-level design and architecture" -# system_prompt = """You are the architecture specialist...""" From e68544947c1e880249fa528cb1675b3e778bf664 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 12 Jun 2026 07:33:59 +0200 Subject: [PATCH 5/7] chore(targets): remove duplicate pi sdk openai target --- .agentv/targets.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 7586f1a2e..eef024a74 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -93,15 +93,6 @@ targets: thinking: low stream_log: raw - - name: pi-sdk-openai - provider: pi-coding-agent - subprovider: openai - base_url: ${{ OPENAI_ENDPOINT }} - api_key: ${{ OPENAI_API_KEY }} - model: gpt-5.5 - grader_target: openai - thinking: low - stream_log: raw - name: pi-azure provider: pi-cli From 3d9064c4e768781ae72b8ec0983b9460318a0183 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 12 Jun 2026 09:49:47 +0200 Subject: [PATCH 6/7] fix(evaluation): stabilize final output trace contract --- apps/cli/test/commands/eval/aggregate.test.ts | 22 +++- .../commands/eval/artifact-writer.test.ts | 69 ++++++++---- .../commands/eval/output-messages.test.ts | 103 ++---------------- .../results/export-e2e-providers.test.ts | 32 +++--- apps/cli/test/commands/results/export.test.ts | 21 ++-- apps/cli/test/commands/results/report.test.ts | 21 +++- apps/cli/test/commands/results/show.test.ts | 2 +- apps/cli/test/commands/trace/trace.test.ts | 14 ++- apps/cli/test/fixtures/mock-run-evaluation.ts | 45 +++++++- .../core/src/observability/otel-exporter.ts | 6 +- .../evaluation/code-grader-multimodal.test.ts | 10 +- .../test/evaluation/conversation-mode.test.ts | 12 +- .../core/test/evaluation/orchestrator.test.ts | 15 +-- .../core/test/fixtures/test-define-grader.ts | 5 +- .../fixtures/test-grader-with-details.cjs | 13 ++- packages/core/test/fixtures/test-grader.cjs | 17 ++- .../test/observability/otel-exporter.test.ts | 73 +++++++++---- packages/eval/test/define-code-grader.test.ts | 31 ++++-- .../eval/test/define-prompt-template.test.ts | 32 +++--- packages/eval/test/deprecation.test.ts | 8 +- packages/eval/test/file-backed-output.test.ts | 15 +-- 21 files changed, 321 insertions(+), 245 deletions(-) diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts index c79bb7076..91200aa61 100644 --- a/apps/cli/test/commands/eval/aggregate.test.ts +++ b/apps/cli/test/commands/eval/aggregate.test.ts @@ -3,7 +3,7 @@ import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'nod import { tmpdir } from 'node:os'; import path from 'node:path'; -import type { EvaluationResult } from '@agentv/core'; +import { type EvaluationResult, buildTraceFromMessages } from '@agentv/core'; import { toSnakeCaseDeep } from '../../../src/utils/case-conversion.js'; import { @@ -14,16 +14,28 @@ import { } from '../../../src/commands/eval/artifact-writer.js'; function makeResult(overrides: Partial = {}): EvaluationResult { - return { + const result = { timestamp: '2026-04-13T00:00:00.000Z', testId: 'test-1', score: 0.9, assertions: [{ text: 'criterion-1', passed: true }], - output: [{ role: 'assistant' as const, content: 'test answer' }], + output: 'test answer', target: 'test-target', executionStatus: 'ok', ...overrides, } as EvaluationResult; + + return { + ...result, + trace: + result.trace ?? + buildTraceFromMessages({ + output: result.output ? [{ role: 'assistant', content: result.output }] : [], + finalOutput: result.output, + target: result.target, + testId: result.testId, + }), + }; } function writeJsonlIndex(dir: string, results: Partial[]): string { @@ -180,9 +192,7 @@ describe('writePerTestArtifacts', () => { }); it('writes response.md for results with output', async () => { - const results = [ - makeResult({ testId: 'test-1', output: [{ role: 'assistant' as const, content: 'hello' }] }), - ]; + const results = [makeResult({ testId: 'test-1', output: 'hello' })]; await writePerTestArtifacts(results, tmpDir); diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 56106fa1a..3bfca87bf 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -6,6 +6,7 @@ import { type EvalTest, type EvaluationResult, type GraderResult, + buildTraceFromMessages, parseYamlValue, } from '@agentv/core'; @@ -26,16 +27,33 @@ import { } from '../../../src/commands/eval/artifact-writer.js'; function makeResult(overrides: Partial = {}): EvaluationResult { - return { + const result = { timestamp: '2026-03-13T00:00:00.000Z', testId: 'test-1', score: 0.9, assertions: [{ text: 'criterion-1', passed: true }], - output: [{ role: 'assistant' as const, content: 'test answer' }], + output: 'test answer', target: 'test-target', executionStatus: 'ok', ...overrides, } as EvaluationResult; + + return { + ...result, + trace: + result.trace ?? + buildTraceFromMessages({ + input: Array.isArray(result.input) ? result.input : [], + output: result.output ? [{ role: 'assistant', content: result.output }] : [], + finalOutput: result.output, + target: result.target, + testId: result.testId, + conversationId: result.conversationId, + tokenUsage: result.tokenUsage, + durationMs: result.durationMs, + costUsd: result.costUsd, + }), + }; } function makeEvaluatorResult(overrides: Partial = {}): GraderResult { @@ -734,6 +752,20 @@ describe('writeArtifactsFromResults', () => { }); it('writes transcript.jsonl as one message object per line', async () => { + const input = [{ role: 'user' as const, content: 'Inspect artifact output' }]; + const output = [ + { + role: 'assistant' as const, + content: 'Reading artifact-writer.ts', + toolCalls: [ + { + tool: 'Read', + input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' }, + output: 'file contents', + }, + ], + }, + ]; const results = [ makeResult({ testId: 'transcript-case', @@ -742,20 +774,19 @@ describe('writeArtifactsFromResults', () => { durationMs: 4200, costUsd: 0.25, tokenUsage: { input: 100, output: 40, cached: 10, reasoning: 5 }, - input: [{ role: 'user' as const, content: 'Inspect artifact output' }], - output: [ - { - role: 'assistant' as const, - content: 'Reading artifact-writer.ts', - toolCalls: [ - { - tool: 'Read', - input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' }, - output: 'file contents', - }, - ], - }, - ], + input, + output: 'Reading artifact-writer.ts', + trace: buildTraceFromMessages({ + input, + output, + finalOutput: 'Reading artifact-writer.ts', + target: 'codex', + testId: 'transcript-case', + conversationId: 'session-123', + tokenUsage: { input: 100, output: 40, cached: 10, reasoning: 5 }, + durationMs: 4200, + costUsd: 0.25, + }), }), ]; @@ -779,7 +810,6 @@ describe('writeArtifactsFromResults', () => { source: { provider: 'codex', session_id: 'session-123', - timestamp: '2026-03-13T00:00:00.000Z', }, }, { @@ -801,7 +831,6 @@ describe('writeArtifactsFromResults', () => { source: { provider: 'codex', session_id: 'session-123', - timestamp: '2026-03-13T00:00:00.000Z', }, }, ]); @@ -822,7 +851,7 @@ describe('writeArtifactsFromResults', () => { target: 'baseline', assertions: [{ text: 'baseline-check', passed: true, evidence: 'baseline evidence' }], input: [{ role: 'user' as const, content: 'baseline input' }], - output: [{ role: 'assistant' as const, content: 'baseline output' }], + output: 'baseline output', }), ]; @@ -1136,7 +1165,7 @@ describe('writeArtifacts (from JSONL file)', () => { test_id: 'from-file', score: 0.85, assertions: [{ text: 'pass-1', passed: true }], - output: [{ role: 'assistant', content: 'file answer' }], + output: 'file answer', target: 'default', execution_status: 'ok', duration_ms: 12000, diff --git a/apps/cli/test/commands/eval/output-messages.test.ts b/apps/cli/test/commands/eval/output-messages.test.ts index 5f345615f..a49dd0d6c 100644 --- a/apps/cli/test/commands/eval/output-messages.test.ts +++ b/apps/cli/test/commands/eval/output-messages.test.ts @@ -1,105 +1,16 @@ import { describe, expect, it } from 'bun:test'; -import type { Message } from '@agentv/core'; - import { trimOutputMessages } from '../../../src/commands/eval/run-eval.js'; -const makeMessages = (): readonly Message[] => [ - { role: 'user', content: 'Hello', startTime: '2024-01-01T00:00:00Z', durationMs: 10 }, - { - role: 'assistant', - content: 'Hi there', - toolCalls: [{ id: 'tc1', name: 'read', arguments: '{}' }], - startTime: '2024-01-01T00:00:01Z', - }, - { role: 'tool', content: 'file contents', name: 'read', durationMs: 50 }, - { role: 'assistant', content: 'Done!', startTime: '2024-01-01T00:00:02Z', durationMs: 100 }, -]; - describe('trimOutputMessages', () => { - describe('default (outputMessages = 1)', () => { - it('returns only the last assistant message trimmed to { role, content }', () => { - const result = trimOutputMessages(makeMessages(), 1); - expect(result).toEqual([{ role: 'assistant', content: 'Done!' }]); - }); - - it('returns empty array when no assistant message exists', () => { - const messages: readonly Message[] = [{ role: 'user', content: 'Hello' }]; - const result = trimOutputMessages(messages, 1); - expect(result).toEqual([]); - }); - - it('strips toolCalls, startTime, durationMs from the last assistant message', () => { - const messages: readonly Message[] = [ - { - role: 'assistant', - content: 'response', - toolCalls: [{ id: 'tc1', name: 'read', arguments: '{}' }], - startTime: '2024-01-01T00:00:00Z', - durationMs: 500, - }, - ]; - const result = trimOutputMessages(messages, 1); - expect(result).toEqual([{ role: 'assistant', content: 'response' }]); - expect(result[0]).not.toHaveProperty('toolCalls'); - expect(result[0]).not.toHaveProperty('startTime'); - expect(result[0]).not.toHaveProperty('durationMs'); - }); - }); - - describe('outputMessages = N (numeric)', () => { - it('returns last N messages (any role) trimmed to { role, content }', () => { - const result = trimOutputMessages(makeMessages(), 3); - expect(result).toEqual([ - { role: 'assistant', content: 'Hi there' }, - { role: 'tool', content: 'file contents' }, - { role: 'assistant', content: 'Done!' }, - ]); - }); - - it('returns all messages when N exceeds message count', () => { - const result = trimOutputMessages(makeMessages(), 100); - expect(result).toHaveLength(4); - expect(result[0]).toEqual({ role: 'user', content: 'Hello' }); - }); - - it('strips metadata from all returned messages', () => { - const result = trimOutputMessages(makeMessages(), 2); - for (const msg of result) { - expect(Object.keys(msg).sort()).toEqual(['content', 'role']); - } - }); - }); - - describe('outputMessages = "all"', () => { - it('returns all messages trimmed to { role, content }', () => { - const result = trimOutputMessages(makeMessages(), 'all'); - expect(result).toHaveLength(4); - expect(result).toEqual([ - { role: 'user', content: 'Hello' }, - { role: 'assistant', content: 'Hi there' }, - { role: 'tool', content: 'file contents' }, - { role: 'assistant', content: 'Done!' }, - ]); - }); - - it('strips all metadata fields from every message', () => { - const result = trimOutputMessages(makeMessages(), 'all'); - for (const msg of result) { - expect(msg).not.toHaveProperty('toolCalls'); - expect(msg).not.toHaveProperty('startTime'); - expect(msg).not.toHaveProperty('durationMs'); - expect(msg).not.toHaveProperty('name'); - } - }); + it('leaves final-answer output unchanged', () => { + expect(trimOutputMessages('Done!', 1)).toBe('Done!'); + expect(trimOutputMessages('Done!', 3)).toBe('Done!'); + expect(trimOutputMessages('Done!', 'all')).toBe('Done!'); }); - describe('edge cases', () => { - it('handles empty output array', () => { - const empty: readonly Message[] = []; - expect(trimOutputMessages(empty, 1)).toEqual([]); - expect(trimOutputMessages(empty, 5)).toEqual([]); - expect(trimOutputMessages(empty, 'all')).toEqual([]); - }); + it('preserves empty final-answer output', () => { + expect(trimOutputMessages('', 1)).toBe(''); + expect(trimOutputMessages('', 'all')).toBe(''); }); }); diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts index 2d8cd1df7..19c0e4be4 100644 --- a/apps/cli/test/commands/results/export-e2e-providers.test.ts +++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts @@ -29,7 +29,7 @@ const CLAUDE_CLI_RESULT = { { text: 'Correct answer', passed: true, evidence: 'Matched expected output' }, { text: 'Used reasoning', passed: true }, ], - output: [{ role: 'assistant', content: 'The answer is 42, derived through extended thinking.' }], + output: 'The answer is 42, derived through extended thinking.', target: 'claude-cli', scores: [ { @@ -66,7 +66,7 @@ const CODEX_RESULT = { { text: 'File edited correctly', passed: true }, { text: 'No extra changes', passed: true }, ], - output: [{ role: 'assistant', content: 'Applied the requested edit to src/main.ts.' }], + output: 'Applied the requested edit to src/main.ts.', target: 'codex', scores: [ { @@ -102,7 +102,7 @@ const COPILOT_RESULT = { { text: 'Code completion correct', passed: true }, { text: 'Follows style guide', passed: false, evidence: 'Missing semicolons' }, ], - output: [{ role: 'assistant', content: 'function add(a, b) { return a + b }' }], + output: 'function add(a, b) { return a + b }', target: 'copilot-cli', scores: [ { @@ -131,7 +131,7 @@ const PI_RESULT = { { text: 'Refactored correctly', passed: true }, { text: 'Tests pass', passed: false, evidence: 'Test suite has 1 failure' }, ], - output: [{ role: 'assistant', content: 'Refactored the module to use dependency injection.' }], + output: 'Refactored the module to use dependency injection.', target: 'pi-coding-agent', duration_ms: 15000, token_usage: { input: 4000, output: 2000 }, @@ -146,7 +146,7 @@ const LLM_AZURE_RESULT = { suite: 'multi-provider', score: 1.0, assertions: [{ text: 'Analysis correct', passed: true }], - output: [{ role: 'assistant', content: 'The code has a race condition in the connection pool.' }], + output: 'The code has a race condition in the connection pool.', target: 'azure-o4-mini', scores: [ { @@ -169,7 +169,7 @@ const LLM_GPT_RESULT = { suite: 'multi-provider', score: 0.8, assertions: [{ text: 'Analysis correct', passed: true }], - output: [{ role: 'assistant', content: 'There might be a concurrency issue.' }], + output: 'There might be a concurrency issue.', target: 'gpt-4.1', duration_ms: 2800, token_usage: { input: 1200, output: 400 }, @@ -184,7 +184,7 @@ const MINIMAL_RESULT = { suite: 'multi-provider', score: 0.5, assertions: [{ text: 'Exists', passed: true }], - output: [{ role: 'assistant', content: 'Response.' }], + output: 'Response.', target: 'mock', execution_status: 'ok', }; @@ -196,7 +196,7 @@ const ERROR_RESULT = { suite: 'multi-provider', score: 0, assertions: [], - output: [], + output: '', target: 'claude-cli', error: 'Agent timed out after 120s', duration_ms: 120000, @@ -527,7 +527,7 @@ describe('export e2e — multi-provider metrics verification', () => { // ── Output artifact tests ────────────────────────────────────────────── - describe('/outputs/response.md — human-readable agent responses', () => { + describe('/outputs/answer.md — human-readable agent responses', () => { it('should write answer text for each provider as markdown', async () => { const outputDir = path.join(tempDir, 'outputs'); const content = toJsonl(CLAUDE_CLI_RESULT, CODEX_RESULT, COPILOT_RESULT); @@ -536,24 +536,24 @@ describe('export e2e — multi-provider metrics verification', () => { expect( readFileSync( - path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'outputs', 'response.md'), + path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'outputs', 'answer.md'), 'utf8', ), - ).toBe('@[assistant]:\nThe answer is 42, derived through extended thinking.'); + ).toBe('The answer is 42, derived through extended thinking.'); expect( readFileSync( - path.join(artifactDir(outputDir, CODEX_RESULT), 'outputs', 'response.md'), + path.join(artifactDir(outputDir, CODEX_RESULT), 'outputs', 'answer.md'), 'utf8', ), - ).toBe('@[assistant]:\nApplied the requested edit to src/main.ts.'); + ).toBe('Applied the requested edit to src/main.ts.'); expect( readFileSync( - path.join(artifactDir(outputDir, COPILOT_RESULT), 'outputs', 'response.md'), + path.join(artifactDir(outputDir, COPILOT_RESULT), 'outputs', 'answer.md'), 'utf8', ), - ).toBe('@[assistant]:\nfunction add(a, b) { return a + b }'); + ).toBe('function add(a, b) { return a + b }'); }); it('should not write output file for error result with empty answer', async () => { @@ -563,7 +563,7 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); expect( - existsSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'outputs', 'response.md')), + existsSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'outputs', 'answer.md')), ).toBe(false); }); }); diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index 75f599e33..13a7b49eb 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -26,7 +26,7 @@ const RESULT_FULL = { { text: 'Says hello', passed: true }, { text: 'Uses name', passed: true }, ], - output: [{ role: 'assistant', content: 'Hello, Alice!' }], + output: 'Hello, Alice!', target: 'gpt-4o', scores: [ { @@ -88,7 +88,7 @@ const RESULT_NO_TRACE = { suite: 'demo', score: 1.0, assertions: [{ text: 'Correct', passed: true }], - output: [{ role: 'assistant', content: 'Yes.' }], + output: 'Yes.', target: 'default', token_usage: { input: 50, output: 20 }, cost_usd: 0.001, @@ -210,7 +210,10 @@ describe('results export', () => { execution_status: 'ok', grading_path: 'demo/test-greeting/grading.json', timing_path: 'demo/test-greeting/timing.json', - output_path: 'demo/test-greeting/outputs/response.md', + output_path: 'demo/test-greeting/outputs/answer.md', + answer_path: 'demo/test-greeting/outputs/answer.md', + response_path: 'demo/test-greeting/outputs/response.md', + transcript_path: 'demo/test-greeting/outputs/transcript.jsonl', input_path: 'demo/test-greeting/input.md', }); }); @@ -270,15 +273,19 @@ describe('results export', () => { expect(existsSync(perTestTimingPath)).toBe(true); }); - it('should write answer text to /outputs/response.md as human-readable markdown', async () => { + it('should write answer text to /outputs/answer.md as human-readable markdown', async () => { const outputDir = path.join(tempDir, 'output'); const content = toJsonl(RESULT_FULL); await exportResults('test.jsonl', content, outputDir); - const answerPath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'response.md'); + const answerPath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'answer.md'); expect(existsSync(answerPath)).toBe(true); - expect(readFileSync(answerPath, 'utf8')).toBe('@[assistant]:\nHello, Alice!'); + expect(readFileSync(answerPath, 'utf8')).toBe('Hello, Alice!'); + + const responsePath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'response.md'); + expect(existsSync(responsePath)).toBe(true); + expect(readFileSync(responsePath, 'utf8')).toBe('Hello, Alice!'); }); it('should group results by target in benchmark.json', async () => { @@ -345,7 +352,7 @@ describe('results export', () => { const answerPath = path.join( artifactDir(outputDir, RESULT_DIFFERENT_TARGET), 'outputs', - 'response.md', + 'answer.md', ); expect(existsSync(answerPath)).toBe(false); }); diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts index e2040eeea..af2d9769f 100644 --- a/apps/cli/test/commands/results/report.test.ts +++ b/apps/cli/test/commands/results/report.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from 'node:os'; import path from 'node:path'; import vm from 'node:vm'; -import type { EvaluationResult, GraderResult } from '@agentv/core'; +import { type EvaluationResult, type GraderResult, buildTraceFromMessages } from '@agentv/core'; import { writeArtifactsFromResults } from '../../../src/commands/eval/artifact-writer.js'; import { @@ -29,13 +29,13 @@ function makeScore( } function makeResult(overrides: Partial = {}): EvaluationResult { - return { + const result = { timestamp: '2026-04-15T01:00:00.000Z', testId: 'test-1', suite: 'default', score: 1, assertions: [{ text: 'fallback assertion', passed: true, evidence: 'ok' }], - output: [{ role: 'assistant', content: 'answer' }], + output: 'answer', input: [{ role: 'user', content: 'question' }], target: 'default', executionStatus: 'ok', @@ -43,6 +43,21 @@ function makeResult(overrides: Partial = {}): EvaluationResult durationMs: 1200, ...overrides, }; + + return { + ...result, + trace: + result.trace ?? + buildTraceFromMessages({ + input: Array.isArray(result.input) ? result.input : [], + output: result.output ? [{ role: 'assistant', content: result.output }] : [], + finalOutput: result.output, + target: result.target, + testId: result.testId, + tokenUsage: result.tokenUsage, + durationMs: result.durationMs, + }), + }; } describe('results report', () => { diff --git a/apps/cli/test/commands/results/show.test.ts b/apps/cli/test/commands/results/show.test.ts index e4a625b2d..700d43b1f 100644 --- a/apps/cli/test/commands/results/show.test.ts +++ b/apps/cli/test/commands/results/show.test.ts @@ -13,7 +13,7 @@ const makeResult = (overrides: Partial = {}): EvaluationResult { text: "contains 'Dear'", passed: false, evidence: "'Dear' not found" }, { text: 'contains greeting', passed: true }, ], - output: [{ role: 'assistant', content: 'Hi there!' }], + output: 'Hi there!', input: [{ role: 'user', content: 'Give a formal greeting' }], executionStatus: 'success', durationMs: 1200, diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index ed091ef41..c0b5ec535 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -205,7 +205,12 @@ describe('trace utils', () => { expect(results).toHaveLength(1); expect(results[0].test_id).toBe('test-2'); - expect(results[0].trace).toBeUndefined(); + expect(results[0].trace).toMatchObject({ + schema_version: 'agentv.trace.v1', + event_count: 0, + messages: [], + events: [], + }); }); it('loads index.jsonl directly', () => { @@ -216,7 +221,12 @@ describe('trace utils', () => { expect(results).toHaveLength(1); expect(results[0].test_id).toBe('test-2'); - expect(results[0].trace).toBeUndefined(); + expect(results[0].trace).toMatchObject({ + schema_version: 'agentv.trace.v1', + event_count: 0, + messages: [], + events: [], + }); }); it('loads simple trace jsonl exports and keeps spans available for trace commands', () => { diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 5f92fee9b..ccb022e31 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -41,7 +41,8 @@ interface EvaluationResultLike { readonly passed: boolean; readonly evidence?: string; }[]; - readonly output: readonly { readonly role: string; readonly content: string }[]; + readonly output: string; + readonly trace: Record; readonly expectedAspectCount: number; readonly target: string; readonly timestamp: string; @@ -63,20 +64,53 @@ function evalCaseIds(evalCases: ReadonlyArray | undefined): readonly st .filter((id): id is string => id !== undefined); } +function buildTrace(targetName: string, testId: string, output: string): Record { + const message = { role: 'assistant', content: output }; + return { + schemaVersion: 'agentv.trace.v1', + eventCount: 2, + toolCalls: {}, + errorCount: 0, + llmCallCount: 1, + messages: [message], + events: [ + { + eventId: 'message-0', + ordinal: 0, + type: 'message', + message, + metadata: { message_index: 0 }, + }, + { + eventId: 'final-response', + parentEventId: 'message-0', + ordinal: 1, + type: 'final_response', + message, + metadata: { message_index: 0 }, + }, + ], + metadata: { provider: 'mock', target: targetName, eval_case_id: testId }, + }; +} + function buildResult(targetName: string, testId: string, index: number): EvaluationResultLike { const baseTime = new Date('2024-01-01T00:00:00.000Z'); if (testId === 'case-alpha') { + const output = 'Alpha answer'; return { testId: 'case-alpha', score: 0.6, assertions: [{ text: 'alpha', passed: true }], - output: [{ role: 'assistant', content: 'Alpha answer' }], + output, + trace: buildTrace(targetName, 'case-alpha', output), expectedAspectCount: 1, target: targetName, timestamp: baseTime.toISOString(), }; } if (testId === 'case-beta') { + const output = 'Beta answer'; return { testId: 'case-beta', score: 0.9, @@ -85,17 +119,20 @@ function buildResult(targetName: string, testId: string, index: number): Evaluat { text: 'gamma', passed: true }, { text: 'delta', passed: false }, ], - output: [{ role: 'assistant', content: 'Beta answer' }], + output, + trace: buildTrace(targetName, 'case-beta', output), expectedAspectCount: 3, target: targetName, timestamp: new Date(baseTime.getTime() + 60_000).toISOString(), }; } + const output = `${testId} answer`; return { testId, score: 1, assertions: [{ text: testId, passed: true }], - output: [{ role: 'assistant', content: `${testId} answer` }], + output, + trace: buildTrace(targetName, testId, output), expectedAspectCount: 1, target: targetName, timestamp: new Date(baseTime.getTime() + index * 60_000).toISOString(), diff --git a/packages/core/src/observability/otel-exporter.ts b/packages/core/src/observability/otel-exporter.ts index d3c400a9a..7a39635aa 100644 --- a/packages/core/src/observability/otel-exporter.ts +++ b/packages/core/src/observability/otel-exporter.ts @@ -216,8 +216,10 @@ export class OtelTraceExporter { rootSpan.setAttribute('agentv.trace.llm_call_count', t.llmCallCount); } - // Child spans from trace messages (--trace mode) - const traceMessages = result.trace.messages; + // Child spans from canonical trace messages. + // Some callers may still export older result artifacts while migrating, + // so tolerate a missing trace instead of crashing the exporter. + const traceMessages = result.trace?.messages ?? []; if (traceMessages.length > 0) { const parentCtx = api.trace.setSpan(api.context.active(), rootSpan); diff --git a/packages/core/test/evaluation/code-grader-multimodal.test.ts b/packages/core/test/evaluation/code-grader-multimodal.test.ts index 25f92711d..130eb5e0e 100644 --- a/packages/core/test/evaluation/code-grader-multimodal.test.ts +++ b/packages/core/test/evaluation/code-grader-multimodal.test.ts @@ -269,8 +269,9 @@ describe('CodeGrader multimodal integration', () => { expect(result.score).toBe(1.0); const details = result.details as Record; const payload = details.payload as Record; - const outputMsgs = payload.output as Record[]; - expect(outputMsgs[0].content).toBe('Hello world'); + expect(payload.output).toBe('answer'); + const messages = payload.messages as Record[]; + expect(messages[0].content).toBe('Hello world'); }); it('materializes image data URIs in output for grader', async () => { @@ -300,8 +301,9 @@ describe('CodeGrader multimodal integration', () => { // Verify the grader received the payload with image paths (not data URIs) const details = result.details as Record; const payload = details.payload as Record; - const outputMsgs = payload.output as Record[]; - const content = outputMsgs[0].content as Record[]; + expect(payload.output).toBe('answer'); + const messages = payload.messages as Record[]; + const content = messages[0].content as Record[]; // Text block preserved expect(content[0]).toEqual({ type: 'text', text: 'Generated chart:' }); diff --git a/packages/core/test/evaluation/conversation-mode.test.ts b/packages/core/test/evaluation/conversation-mode.test.ts index 2eeb8eee4..cdeca3c30 100644 --- a/packages/core/test/evaluation/conversation-mode.test.ts +++ b/packages/core/test/evaluation/conversation-mode.test.ts @@ -549,7 +549,7 @@ describe('runEvalCase — conversation mode', () => { expect(turn2Score?.score).toBe(1.0); }); - it('output contains full conversation transcript with all user and assistant messages', async () => { + it('output is the final answer while trace contains the full conversation transcript', async () => { const provider = new SequenceProvider('mock', [ assistantResponse('Answer 1'), assistantResponse('Answer 2'), @@ -574,10 +574,12 @@ describe('runEvalCase — conversation mode', () => { now: nowFn, }); - // Output should have all messages from the conversation - const output = result.output ?? []; - const userMessages = output.filter((m) => m.role === 'user'); - const assistantMessages = output.filter((m) => m.role === 'assistant'); + // Output is only the final answer/scored result. + expect(result.output).toBe('Answer 2'); + + // Trace preserves all messages from the conversation. + const userMessages = result.trace.messages.filter((m) => m.role === 'user'); + const assistantMessages = result.trace.messages.filter((m) => m.role === 'assistant'); expect(userMessages.length).toBe(2); expect(assistantMessages.length).toBe(2); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 29a339bf0..ebdb2fcee 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -801,10 +801,8 @@ describe('runEvalCase trace integration', () => { expect(result.trace?.errorCount).toBe(0); }); - it('omits trace when provider returns no output', async () => { - const provider = new TraceProvider('mock', { - output: [{ role: 'assistant', content: 'The weather is sunny' }], - }); + it('builds a canonical trace even when provider returns no output', async () => { + const provider = new TraceProvider('mock', {}); const result = await runEvalCase({ evalCase: traceTestCase, @@ -813,7 +811,10 @@ describe('runEvalCase trace integration', () => { evaluators: evaluatorRegistry, }); - expect(result.trace).toBeUndefined(); + expect(result.output).toBe(''); + expect(result.trace).toBeDefined(); + expect(result.trace.messages.map((message) => message.role)).toEqual(['user', 'assistant']); + expect(result.trace.events.some((event) => event.type === 'final_response')).toBe(true); }); it('includes trace when provider reports tokenUsage without output', async () => { @@ -907,7 +908,7 @@ describe('runEvalCase trace integration', () => { expect(result.scores?.[0]?.verdict).toBe('pass'); }); - it('fails tool-trajectory evaluator when no trace available', async () => { + it('fails tool-trajectory evaluator when the trace has no matching tools', async () => { const provider = new TraceProvider('mock', { output: [{ role: 'assistant', content: 'Result' }], }); @@ -944,7 +945,7 @@ describe('runEvalCase trace integration', () => { expect(result.score).toBe(0); expect(result.scores?.[0]?.verdict).toBe('fail'); expect(result.scores?.[0]?.assertions.filter((a) => !a.passed).map((a) => a.text)).toContain( - 'No trace available for evaluation', + 'search: called 0 times (required >=1)', ); }); diff --git a/packages/core/test/fixtures/test-define-grader.ts b/packages/core/test/fixtures/test-define-grader.ts index f5c41f75d..344a5b09c 100644 --- a/packages/core/test/fixtures/test-define-grader.ts +++ b/packages/core/test/fixtures/test-define-grader.ts @@ -7,8 +7,9 @@ import { defineCodeGrader } from '../../../eval/src/index.js'; export default defineCodeGrader(({ output, criteria }) => { const assertions: { text: string; passed: boolean }[] = []; - // Extract text from the output message array - const candidateText = (output ?? []).map((m) => String(m.content ?? '')).join(' '); + // `output` is the final answer/scored result. Transcript-aware graders should + // use messages/trace instead. + const candidateText = output ?? ''; // Simple check: does candidate mention the criteria keywords? const outcomeWords = criteria.toLowerCase().split(/\s+/); diff --git a/packages/core/test/fixtures/test-grader-with-details.cjs b/packages/core/test/fixtures/test-grader-with-details.cjs index b11c34d36..54e40eed2 100644 --- a/packages/core/test/fixtures/test-grader-with-details.cjs +++ b/packages/core/test/fixtures/test-grader-with-details.cjs @@ -7,9 +7,16 @@ const fs = require('node:fs'); const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expected_output); -// Extract candidate text from the output message array -const outputMessages = Array.isArray(input.output) ? input.output : []; -const candidateText = outputMessages.map((m) => String(m.content ?? '')).join(''); +// `output` is the final answer/scored result. Keep a tiny legacy fallback so +// this fixture can still explain failures if an old message-array payload leaks. +const candidateText = + typeof input.output === 'string' + ? input.output + : typeof input.answer === 'string' + ? input.answer + : Array.isArray(input.output) + ? input.output.map((m) => String(m.content ?? '')).join('') + : ''; const hasCandidate = candidateText.length > 0; // Emit details with structured metrics diff --git a/packages/core/test/fixtures/test-grader.cjs b/packages/core/test/fixtures/test-grader.cjs index e341fb69f..5e042b807 100644 --- a/packages/core/test/fixtures/test-grader.cjs +++ b/packages/core/test/fixtures/test-grader.cjs @@ -4,11 +4,18 @@ const fs = require('node:fs'); const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expected_output); -// Extract candidate text from the output message array -const outputMessages = Array.isArray(input.output) ? input.output : []; -const candidateText = outputMessages - .map((m) => (typeof m.content === 'string' ? m.content : JSON.stringify(m.content))) - .join(''); +// `output` is the final answer/scored result. Keep a tiny legacy fallback so +// this fixture can still explain failures if an old message-array payload leaks. +const candidateText = + typeof input.output === 'string' + ? input.output + : typeof input.answer === 'string' + ? input.answer + : Array.isArray(input.output) + ? input.output + .map((m) => (typeof m.content === 'string' ? m.content : JSON.stringify(m.content))) + .join('') + : ''; const hasCandidate = candidateText.length > 0; let candidateDecisionOk = false; diff --git a/packages/core/test/observability/otel-exporter.test.ts b/packages/core/test/observability/otel-exporter.test.ts index 9e2035a1a..c10ef1ba0 100644 --- a/packages/core/test/observability/otel-exporter.test.ts +++ b/packages/core/test/observability/otel-exporter.test.ts @@ -4,6 +4,7 @@ */ import { afterEach, describe, expect, it } from 'bun:test'; +import { buildTraceFromMessages } from '../../src/evaluation/trace.js'; import { OTEL_BACKEND_PRESETS, OtelTraceExporter } from '../../src/observability/otel-exporter.js'; // --------------------------------------------------------------------------- @@ -225,7 +226,13 @@ describe('W3C traceparent propagation', () => { testId: 'test-tp', target: 'my-agent', score: 1, - output: [{ role: 'assistant' as const, content: 'ok' }], + output: 'ok', + trace: buildTraceFromMessages({ + output: [{ role: 'assistant' as const, content: 'ok' }], + finalOutput: 'ok', + target: 'my-agent', + testId: 'test-tp', + }), timestamp: new Date().toISOString(), }) as unknown as Parameters[0]; @@ -353,14 +360,20 @@ describe('Per-span token usage metrics', () => { target: 'my-agent', score: 1, timestamp: new Date().toISOString(), - output: [ - { - role: 'assistant', - content: 'hello', - metadata: { model: 'gpt-4' }, - tokenUsage: { input: 100, output: 50, cached: 25 }, - }, - ], + output: 'hello', + trace: buildTraceFromMessages({ + output: [ + { + role: 'assistant', + content: 'hello', + metadata: { model: 'gpt-4' }, + tokenUsage: { input: 100, output: 50, cached: 25 }, + }, + ], + finalOutput: 'hello', + target: 'my-agent', + testId: 'test-tokens', + }), } as unknown as Parameters[0]; await setup.exporter.exportResult(result); @@ -385,13 +398,19 @@ describe('Per-span token usage metrics', () => { target: 'my-agent', score: 1, timestamp: new Date().toISOString(), - output: [ - { - role: 'assistant', - content: 'hello', - metadata: { model: 'gpt-4' }, - }, - ], + output: 'hello', + trace: buildTraceFromMessages({ + output: [ + { + role: 'assistant', + content: 'hello', + metadata: { model: 'gpt-4' }, + }, + ], + finalOutput: 'hello', + target: 'my-agent', + testId: 'test-no-tokens', + }), } as unknown as Parameters[0]; await setup.exporter.exportResult(result); @@ -416,14 +435,20 @@ describe('Per-span token usage metrics', () => { target: 'my-agent', score: 1, timestamp: new Date().toISOString(), - output: [ - { - role: 'assistant', - content: 'hello', - metadata: { model: 'gpt-4' }, - tokenUsage: { input: 200, output: 75 }, - }, - ], + output: 'hello', + trace: buildTraceFromMessages({ + output: [ + { + role: 'assistant', + content: 'hello', + metadata: { model: 'gpt-4' }, + tokenUsage: { input: 200, output: 75 }, + }, + ], + finalOutput: 'hello', + target: 'my-agent', + testId: 'test-partial-tokens', + }), } as unknown as Parameters[0]; await setup.exporter.exportResult(result); diff --git a/packages/eval/test/define-code-grader.test.ts b/packages/eval/test/define-code-grader.test.ts index e09c0ba49..e17e93230 100644 --- a/packages/eval/test/define-code-grader.test.ts +++ b/packages/eval/test/define-code-grader.test.ts @@ -14,6 +14,16 @@ import { MessageSchema, } from '../src/schemas.js'; +const makeTrace = (overrides: Record = {}) => ({ + schemaVersion: 'agentv.trace.v1', + eventCount: 3, + toolCalls: { read: 2, write: 1 }, + errorCount: 0, + messages: [], + events: [], + ...overrides, +}); + // --------------------------------------------------------------------------- // Content schemas // --------------------------------------------------------------------------- @@ -180,11 +190,7 @@ describe('CodeGraderInputSchema', () => { it('accepts optional trace', () => { const inputWithTrace = { ...validInput, - trace: { - eventCount: 3, - toolCalls: { read: 2, write: 1 }, - errorCount: 0, - }, + trace: makeTrace(), }; const result = CodeGraderInputSchema.parse(inputWithTrace); expect(result.trace?.eventCount).toBe(3); @@ -209,10 +215,11 @@ describe('CodeGraderInputSchema', () => { expect(result.config).toEqual({ maxToolCalls: 10, strictMode: true }); }); - it('accepts optional output with toolCalls', () => { + it('accepts final output plus transcript messages with toolCalls', () => { const inputWithOutput = { ...validInput, - output: [ + output: 'Reading file...', + messages: [ { role: 'assistant', content: 'Reading file...', @@ -221,13 +228,15 @@ describe('CodeGraderInputSchema', () => { ], }; const result = CodeGraderInputSchema.parse(inputWithOutput); - expect(result.output?.[0].toolCalls?.[0].tool).toBe('read'); + expect(result.output).toBe('Reading file...'); + expect(result.messages?.[0].toolCalls?.[0].tool).toBe('read'); }); - it('accepts output with Content[] containing image blocks', () => { + it('accepts transcript messages with Content[] containing image blocks', () => { const inputWithImages = { ...validInput, - output: [ + output: 'Generated chart:', + messages: [ { role: 'assistant', content: [ @@ -238,7 +247,7 @@ describe('CodeGraderInputSchema', () => { ], }; const result = CodeGraderInputSchema.parse(inputWithImages); - const content = result.output?.[0].content as { type: string; path?: string }[]; + const content = result.messages?.[0].content as { type: string; path?: string }[]; expect(content).toHaveLength(2); expect(content[1].type).toBe('image'); expect(content[1].path).toBe('/workspace/chart.png'); diff --git a/packages/eval/test/define-prompt-template.test.ts b/packages/eval/test/define-prompt-template.test.ts index 890b80201..2ed471c26 100644 --- a/packages/eval/test/define-prompt-template.test.ts +++ b/packages/eval/test/define-prompt-template.test.ts @@ -2,6 +2,16 @@ import { describe, expect, it } from 'bun:test'; import { PromptTemplateInputSchema } from '../src/schemas.js'; +const makeTrace = (overrides: Record = {}) => ({ + schemaVersion: 'agentv.trace.v1', + eventCount: 3, + toolCalls: { read: 2, write: 1 }, + errorCount: 0, + messages: [], + events: [], + ...overrides, +}); + describe('PromptTemplateInputSchema', () => { // Minimal valid input with all required fields const validInput = { @@ -29,11 +39,7 @@ describe('PromptTemplateInputSchema', () => { it('accepts optional trace', () => { const inputWithTrace = { ...validInput, - trace: { - eventCount: 3, - toolCalls: { read: 2, write: 1 }, - errorCount: 0, - }, + trace: makeTrace(), }; const result = PromptTemplateInputSchema.parse(inputWithTrace); expect(result.trace?.eventCount).toBe(3); @@ -85,10 +91,11 @@ describe('PromptTemplateInputSchema', () => { expect(result.input[0].content).toBe('What is 2+2?'); }); - it('accepts optional output with toolCalls', () => { + it('accepts final output plus transcript messages with toolCalls', () => { const inputWithOutput = { ...validInput, - output: [ + output: 'Reading file...', + messages: [ { role: 'assistant', content: 'Reading file...', @@ -97,21 +104,18 @@ describe('PromptTemplateInputSchema', () => { ], }; const result = PromptTemplateInputSchema.parse(inputWithOutput); - expect(result.output?.[0].toolCalls?.[0].tool).toBe('read'); + expect(result.output).toBe('Reading file...'); + expect(result.messages?.[0].toolCalls?.[0].tool).toBe('read'); }); it('accepts full input with all fields', () => { const fullInput = { criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - output: [{ role: 'assistant', content: 'The answer is 4' }], + output: 'The answer is 4', inputFiles: ['/path/to/input.txt'], input: [{ role: 'user', content: 'What is 2+2?' }], - trace: { - eventCount: 1, - toolCalls: {}, - errorCount: 0, - }, + trace: makeTrace({ eventCount: 1, toolCalls: {} }), config: { rubric: 'Check correctness' }, }; const result = PromptTemplateInputSchema.parse(fullInput); diff --git a/packages/eval/test/deprecation.test.ts b/packages/eval/test/deprecation.test.ts index e025fd973..6a63b03c7 100644 --- a/packages/eval/test/deprecation.test.ts +++ b/packages/eval/test/deprecation.test.ts @@ -23,15 +23,17 @@ describe('enrichInput — pass-through', () => { expect(result).toBe(input); }); - it('structured fields (input, output, expectedOutput) remain Message[]', () => { + it('structured fields (input, messages, expectedOutput) remain transcript arrays', () => { const input = buildInput({ input: [{ role: 'user', content: 'Hello' }], - output: [{ role: 'assistant', content: 'Hi' }], + output: 'Hi', + messages: [{ role: 'assistant', content: 'Hi' }], expectedOutput: [{ role: 'assistant', content: 'Hi there' }], }); enrichInput(input); expect(Array.isArray(input.input)).toBe(true); - expect(Array.isArray(input.output)).toBe(true); + expect(input.output).toBe('Hi'); + expect(Array.isArray(input.messages)).toBe(true); expect(Array.isArray(input.expectedOutput)).toBe(true); }); }); diff --git a/packages/eval/test/file-backed-output.test.ts b/packages/eval/test/file-backed-output.test.ts index 58e931f3e..27de99630 100644 --- a/packages/eval/test/file-backed-output.test.ts +++ b/packages/eval/test/file-backed-output.test.ts @@ -48,12 +48,9 @@ describe('Lazy file-backed output loading', () => { }); it('lazily loads output from file when outputPath is set', () => { - const messages = [ - { role: 'assistant', content: 'Hello from file' }, - { role: 'user', content: 'Test' }, - ]; + const answer = 'Hello from file'; const filePath = join(tmpDir, 'output.json'); - writeFileSync(filePath, JSON.stringify(messages)); + writeFileSync(filePath, JSON.stringify(answer)); const input: CodeGraderInput = CodeGraderInputSchema.parse({ criteria: 'test', @@ -79,8 +76,7 @@ describe('Lazy file-backed output loading', () => { // First access triggers file read const output = input.output; - expect(output).toHaveLength(2); - expect(output?.[0].content).toBe('Hello from file'); + expect(output).toBe('Hello from file'); // Second access uses cache const output2 = input.output; @@ -91,13 +87,12 @@ describe('Lazy file-backed output loading', () => { const input: CodeGraderInput = CodeGraderInputSchema.parse({ criteria: 'test', expectedOutput: [], - output: [{ role: 'assistant', content: 'inline' }], + output: 'inline', inputFiles: [], input: [], }); // No lazy loading needed — output is already present - expect(input.output).toHaveLength(1); - expect(input.output?.[0].content).toBe('inline'); + expect(input.output).toBe('inline'); }); }); From 34dc821fbdb0c95cfa1f98f5bf6dccbdae58192d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 12 Jun 2026 11:37:22 +0200 Subject: [PATCH 7/7] fix(evaluation): pass final output to prompt templates --- .../evaluation/graders/prompt-resolution.ts | 10 ++- .../graders/prompt-resolution.test.ts | 73 +++++++++++++++++++ .../core/test/evaluation/orchestrator.test.ts | 4 +- 3 files changed, 82 insertions(+), 5 deletions(-) diff --git a/packages/core/src/evaluation/graders/prompt-resolution.ts b/packages/core/src/evaluation/graders/prompt-resolution.ts index b31717047..2306c4b56 100644 --- a/packages/core/src/evaluation/graders/prompt-resolution.ts +++ b/packages/core/src/evaluation/graders/prompt-resolution.ts @@ -17,7 +17,7 @@ import { toSnakeCaseDeep } from '../case-conversion.js'; import { readTextFile } from '../file-utils.js'; import type { Message } from '../providers/types.js'; import { VALID_TEMPLATE_VARIABLES } from '../template-variables.js'; -import type { TraceSummary } from '../trace.js'; +import type { Trace } from '../trace.js'; import type { EvalTest, PromptScriptConfig } from '../types.js'; import { executeScript } from './code-grader.js'; @@ -25,7 +25,7 @@ export interface ResolveCustomPromptContext { readonly evalCase: EvalTest; readonly candidate: string; readonly output?: readonly Message[]; - readonly trace?: TraceSummary; + readonly trace?: Trace; readonly config?: Record; readonly fileChanges?: string; readonly workspacePath?: string; @@ -97,10 +97,14 @@ async function executePromptTemplate( config?: Record, timeoutMs?: number, ): Promise { + const messages = context.trace?.messages ?? context.output ?? []; + const payload = { criteria: context.evalCase.criteria, expectedOutput: context.evalCase.expected_output, - output: context.output ?? null, + output: context.candidate, + answer: context.candidate, + messages, inputFiles: context.evalCase.file_paths, input: context.evalCase.input, metadata: context.evalCase.metadata ?? null, diff --git a/packages/core/test/evaluation/graders/prompt-resolution.test.ts b/packages/core/test/evaluation/graders/prompt-resolution.test.ts index 1c17cec2f..731a28dde 100644 --- a/packages/core/test/evaluation/graders/prompt-resolution.test.ts +++ b/packages/core/test/evaluation/graders/prompt-resolution.test.ts @@ -1,9 +1,16 @@ import { describe, expect, it } from 'bun:test'; +import { mkdtempSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { fileURLToPath, pathToFileURL } from 'node:url'; import { containsTemplateVariables, resolveCustomPrompt, } from '../../../src/evaluation/graders/prompt-resolution.js'; +import { buildTraceFromMessages } from '../../../src/evaluation/trace.js'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); describe('containsTemplateVariables', () => { it('returns true for template with {{output}}', () => { @@ -82,4 +89,70 @@ describe('resolveCustomPrompt', () => { }); expect(result).toBeUndefined(); }); + + it('passes final answer as output and transcript through messages/trace to executable prompts', async () => { + const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-template-contract-')); + const promptPath = path.join(tmpDir, 'prompt-template.ts'); + const promptTemplateRuntime = pathToFileURL( + path.resolve(__dirname, '../../../../eval/src/prompt-template.ts'), + ).href; + + writeFileSync( + promptPath, + `import { definePromptTemplate } from ${JSON.stringify(promptTemplateRuntime)}; + +definePromptTemplate((ctx) => { + if (typeof ctx.output !== 'string') { + throw new Error('expected output to be the final answer string'); + } + if (ctx.output !== 'Final answer') { + throw new Error('unexpected final answer: ' + ctx.output); + } + if (ctx.answer !== ctx.output) { + throw new Error('answer should mirror output'); + } + if (!Array.isArray(ctx.messages) || ctx.messages.length < 2) { + throw new Error('expected transcript messages'); + } + if (!ctx.messages.some((message) => message.role === 'assistant' && message.content === 'Trace assistant turn')) { + throw new Error('expected transcript message from trace'); + } + if (!ctx.trace || !Array.isArray(ctx.trace.messages) || ctx.trace.messages.length !== ctx.messages.length) { + throw new Error('expected full trace with transcript messages'); + } + + return \`Final: \${ctx.output}; messages: \${ctx.messages.length}; trace: \${ctx.trace.messages.length}\`; +}); +`, + ); + + const trace = buildTraceFromMessages({ + input: [{ role: 'user', content: 'Question?' }], + output: [{ role: 'assistant', content: 'Trace assistant turn' }], + finalOutput: 'Final answer', + target: 'mock', + testId: 'prompt-contract', + }); + + const result = await resolveCustomPrompt( + { + resolvedPromptScript: [process.execPath, 'run', promptPath], + }, + { + evalCase: { + id: 'prompt-contract', + input: [{ role: 'user', content: 'Question?' }], + expected_output: [{ role: 'assistant', content: 'Expected answer' }], + file_paths: [], + criteria: 'Check final answer.', + }, + candidate: 'Final answer', + output: [{ role: 'assistant', content: 'Legacy transcript fallback' }], + trace, + }, + 5_000, + ); + + expect(result).toBe('Final: Final answer; messages: 2; trace: 2'); + }); }); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index ebdb2fcee..3511218f8 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -1153,7 +1153,7 @@ describe('runEvalCase trace integration', () => { const stdin = readFileSync(0, 'utf8'); const input = JSON.parse(stdin); const question = (input.input || []).map((m) => String(m.content ?? '')).join('\\n'); -const answer = (input.output || []).map((m) => String(m.content ?? '')).join('\\n'); +const answer = String(input.output ?? ''); const ref = (input.expected_output || []).map((m) => String(m.content ?? '')).join('\\n') || 'none'; console.log(\`Question: \${question} Answer: \${answer} @@ -1223,7 +1223,7 @@ Reference: \${ref}\`); const stdin = fs.readFileSync(0, 'utf8'); const input = JSON.parse(stdin); const question = (input.input || []).map((m) => String(m.content || '')).join('\\n'); -const answer = (input.output || []).map((m) => String(m.content || '')).join('\\n'); +const answer = String(input.output || ''); console.log('Question: ' + question + '\\nAnswer: ' + answer); `, );