From 06a69e8543aac67596af1e01f0e7c3452636827e Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 12 Jun 2026 07:24:42 +0200
Subject: [PATCH 1/7] feat(trace): add canonical evaluation trace model

---
 packages/core/src/evaluation/trace.ts | 496 +++++++++++++++++++++-----
 packages/core/src/import/index.ts     |   2 +
 packages/core/src/import/types.ts     |  59 +++
 packages/eval/src/index.ts            |  37 +-
 packages/eval/src/schemas.ts          | 107 +++++-
 5 files changed, 598 insertions(+), 103 deletions(-)

diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts
index d6d4f8210..1541961db 100644
--- a/packages/core/src/evaluation/trace.ts
+++ b/packages/core/src/evaluation/trace.ts
@@ -1,19 +1,21 @@
 /**
  * Trace models for evaluation-time agent behavior.
  *
- * This module separates the canonical trace contract from compatibility views:
- * - NormalizedTrajectory is the full, versioned trajectory contract that importers,
- *   replay, and trajectory-aware graders should use as the source of truth.
- * - TraceSummary is a derived compact read model used by existing graders, result
- *   artifacts, and CLI/dashboard aggregation. When a full trajectory exists, do
- *   not author TraceSummary independently; derive it with
- *   computeTraceSummaryFromTrajectory().
+ * `Trace` is AgentV's canonical normalized execution model. Evaluation results
+ * keep `output` as the final answer/scored result only; the full transcript,
+ * tool calls/results, errors, timing, usage, provider/session provenance, and
+ * replay/eval metrics live in `trace`.
  *
- * Keep TypeScript internals camelCase. Persisted trajectory artifacts use the
- * snake_case NormalizedTrajectoryWire shape and must pass through the converters
- * in this file.
+ * `TraceSummary` is a derived compact read model for metric-style graders and
+ * aggregation. Derive it from `Trace.messages`/`Trace.events`; do not treat it
+ * as the canonical trace.
+ *
+ * TypeScript internals are camelCase. Persisted JSON/JSONL wire shapes are
+ * snake_case and must pass through the converters in this file or a boundary
+ * serializer such as `toSnakeCaseDeep()`.
  */
 import { z } from 'zod';
+import type { Message } from './providers/types.js';
 
 export const NORMALIZED_TRAJECTORY_SCHEMA_VERSION = 'agentv.trace.v1' as const;
 
@@ -32,19 +34,36 @@ export const NORMALIZED_TRACE_EVENT_TYPES = [
   'model_turn',
   'tool_call',
   'tool_result',
+  'final_response',
+  'error',
 ] as const;
 
 export const NORMALIZED_TOOL_STATUSES = ['ok', 'error', 'timeout', 'cancelled', 'unknown'] as const;
 
 export const NORMALIZED_REDACTION_LEVELS = ['none', 'partial', 'full'] as const;
 
-export type NormalizedTraceSourceKind = (typeof NORMALIZED_TRACE_SOURCE_KINDS)[number];
-export type NormalizedTraceEventType = (typeof NORMALIZED_TRACE_EVENT_TYPES)[number];
-export type NormalizedToolStatus = (typeof NORMALIZED_TOOL_STATUSES)[number];
-export type NormalizedRedactionLevel = (typeof NORMALIZED_REDACTION_LEVELS)[number];
-
-export interface NormalizedTraceSource {
-  readonly kind: NormalizedTraceSourceKind;
+export const TRACE_SCHEMA_VERSION = NORMALIZED_TRAJECTORY_SCHEMA_VERSION;
+export const TRACE_SOURCE_KINDS = NORMALIZED_TRACE_SOURCE_KINDS;
+export const TRACE_EVENT_TYPES = NORMALIZED_TRACE_EVENT_TYPES;
+export const TRACE_TOOL_STATUSES = NORMALIZED_TOOL_STATUSES;
+export const TRACE_REDACTION_LEVELS = NORMALIZED_REDACTION_LEVELS;
+
+export type TraceSourceKind = (typeof TRACE_SOURCE_KINDS)[number];
+export type TraceEventType = (typeof TRACE_EVENT_TYPES)[number];
+export type TraceToolStatus = (typeof TRACE_TOOL_STATUSES)[number];
+export type TraceRedactionLevel = (typeof TRACE_REDACTION_LEVELS)[number];
+
+/** @deprecated Use TraceSourceKind. */
+export type NormalizedTraceSourceKind = TraceSourceKind;
+/** @deprecated Use TraceEventType. */
+export type NormalizedTraceEventType = TraceEventType;
+/** @deprecated Use TraceToolStatus. */
+export type NormalizedToolStatus = TraceToolStatus;
+/** @deprecated Use TraceRedactionLevel. */
+export type NormalizedRedactionLevel = TraceRedactionLevel;
+
+export interface TraceSource {
+  readonly kind: TraceSourceKind;
   readonly path?: string;
   readonly url?: string;
   readonly provider?: string;
@@ -53,7 +72,7 @@ export interface NormalizedTraceSource {
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
-export interface NormalizedTraceSession {
+export interface TraceSession {
   readonly sessionId?: string;
   readonly conversationId?: string;
   readonly cwd?: string;
@@ -62,7 +81,7 @@ export interface NormalizedTraceSession {
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
-export interface NormalizedTraceBranch {
+export interface TraceBranch {
   readonly selectedLeafId?: string;
   readonly selectedPathIds?: readonly string[];
   readonly includedEventIds?: readonly string[];
@@ -70,7 +89,7 @@ export interface NormalizedTraceBranch {
   readonly selectionReason?: string;
 }
 
-export interface NormalizedTraceSourceRef {
+export interface TraceSourceRef {
   readonly eventId?: string;
   readonly messageId?: string;
   readonly spanId?: string;
@@ -81,7 +100,7 @@ export interface NormalizedTraceSourceRef {
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
-export interface NormalizedRawEvidence {
+export interface TraceRawEvidence {
   readonly kind: string;
   readonly ref?: string;
   readonly mediaType?: string;
@@ -90,13 +109,13 @@ export interface NormalizedRawEvidence {
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
-export interface NormalizedRedactionState {
-  readonly level: NormalizedRedactionLevel;
+export interface TraceRedactionState {
+  readonly level: TraceRedactionLevel;
   readonly fields?: readonly string[];
   readonly reason?: string;
 }
 
-export interface NormalizedTraceError {
+export interface TraceError {
   readonly message: string;
   readonly name?: string;
   readonly code?: string;
@@ -104,16 +123,16 @@ export interface NormalizedTraceError {
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
-export interface NormalizedTraceMessage {
+export interface TraceMessage {
   readonly role: string;
   readonly name?: string;
   readonly content?: unknown;
-  readonly redaction?: NormalizedRedactionState;
+  readonly redaction?: TraceRedactionState;
   readonly tokenUsage?: TokenUsage;
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
-export interface NormalizedTraceModel {
+export interface TraceModel {
   readonly provider?: string;
   readonly name?: string;
   readonly invocationId?: string;
@@ -121,48 +140,49 @@ export interface NormalizedTraceModel {
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
-export interface NormalizedTraceTool {
+export interface TraceTool {
   readonly name: string;
   readonly callId?: string;
   readonly input?: unknown;
   readonly output?: unknown;
-  readonly status?: NormalizedToolStatus;
-  readonly error?: NormalizedTraceError;
-  readonly redaction?: NormalizedRedactionState;
+  readonly status?: TraceToolStatus;
+  readonly error?: TraceError;
+  readonly redaction?: TraceRedactionState;
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
-export interface NormalizedTraceEvent {
+export interface TraceEvent {
   readonly eventId: string;
   readonly parentEventId?: string;
   readonly ordinal: number;
-  readonly type: NormalizedTraceEventType;
+  readonly type: TraceEventType;
   readonly timestamp?: string;
   readonly durationMs?: number;
   readonly durationInferred?: boolean;
   readonly turnIndex?: number;
-  readonly message?: NormalizedTraceMessage;
-  readonly model?: NormalizedTraceModel;
-  readonly tool?: NormalizedTraceTool;
-  readonly sourceRef?: NormalizedTraceSourceRef;
-  readonly rawEvidence?: readonly NormalizedRawEvidence[];
-  readonly redaction?: NormalizedRedactionState;
+  readonly message?: TraceMessage;
+  readonly model?: TraceModel;
+  readonly tool?: TraceTool;
+  readonly error?: TraceError;
+  readonly sourceRef?: TraceSourceRef;
+  readonly rawEvidence?: readonly TraceRawEvidence[];
+  readonly redaction?: TraceRedactionState;
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
 /**
- * Canonical in-memory trajectory model.
+ * Legacy imported trace artifact shape used by older import/replay helpers.
  *
- * Persisted trajectory artifacts are the snake_case wire shape below. They do
- * not embed TraceSummary because compact summaries are one-way projections from
- * this full event stream.
+ * New evaluation results use `Trace` below: final answer in `output`, full
+ * transcript under `trace.messages`, structured spans under `trace.events`, and
+ * provider-native session identifiers in `trace.metadata`.
  */
-export interface NormalizedTrajectory {
+export interface TraceArtifact {
   readonly schemaVersion: typeof NORMALIZED_TRAJECTORY_SCHEMA_VERSION;
-  readonly source: NormalizedTraceSource;
-  readonly session: NormalizedTraceSession;
-  readonly branch?: NormalizedTraceBranch;
-  readonly events: readonly NormalizedTraceEvent[];
+  readonly source: TraceSource;
+  readonly session: TraceSession;
+  readonly branch?: TraceBranch;
+  readonly events: readonly TraceEvent[];
   readonly tokenUsage?: TokenUsage;
   readonly costUsd?: number;
   readonly durationMs?: number;
@@ -171,6 +191,12 @@ export interface NormalizedTrajectory {
   readonly metadata?: Readonly<Record<string, unknown>>;
 }
 
+/**
+ * @deprecated Use `Trace` for evaluation results or `TraceArtifact` for legacy
+ * import/replay artifacts.
+ */
+export type NormalizedTrajectory = TraceArtifact;
+
 function omitUndefinedProperties<T extends Record<string, unknown>>(value: T): T {
   return Object.fromEntries(
     Object.entries(value).filter(([, property]) => property !== undefined),
@@ -286,6 +312,7 @@ export const NormalizedTraceEventWireSchema = z.object({
   message: NormalizedTraceMessageWireSchema.optional(),
   model: NormalizedTraceModelWireSchema.optional(),
   tool: NormalizedTraceToolWireSchema.optional(),
+  error: NormalizedTraceErrorWireSchema.optional(),
   source_ref: NormalizedTraceSourceRefWireSchema.optional(),
   raw_evidence: z.array(NormalizedRawEvidenceWireSchema).optional(),
   redaction: NormalizedRedactionStateWireSchema.optional(),
@@ -309,9 +336,23 @@ export const NormalizedTrajectoryWireSchema = z.object({
 export type NormalizedTrajectoryWire = z.infer<typeof NormalizedTrajectoryWireSchema>;
 export type NormalizedTraceEventWire = z.infer<typeof NormalizedTraceEventWireSchema>;
 
-export function toNormalizedTrajectoryWire(
-  trajectory: NormalizedTrajectory,
-): NormalizedTrajectoryWire {
+export const TraceRedactionStateWireSchema = NormalizedRedactionStateWireSchema;
+export const TraceErrorWireSchema = NormalizedTraceErrorWireSchema;
+export const TraceSourceWireSchema = NormalizedTraceSourceWireSchema;
+export const TraceSessionWireSchema = NormalizedTraceSessionWireSchema;
+export const TraceBranchWireSchema = NormalizedTraceBranchWireSchema;
+export const TraceSourceRefWireSchema = NormalizedTraceSourceRefWireSchema;
+export const TraceRawEvidenceWireSchema = NormalizedRawEvidenceWireSchema;
+export const TraceMessageWireSchema = NormalizedTraceMessageWireSchema;
+export const TraceModelWireSchema = NormalizedTraceModelWireSchema;
+export const TraceToolWireSchema = NormalizedTraceToolWireSchema;
+export const TraceEventWireSchema = NormalizedTraceEventWireSchema;
+export const TraceArtifactWireSchema = NormalizedTrajectoryWireSchema;
+
+export type TraceArtifactWire = NormalizedTrajectoryWire;
+export type TraceEventWire = NormalizedTraceEventWire;
+
+export function toNormalizedTrajectoryWire(trajectory: TraceArtifact): NormalizedTrajectoryWire {
   return NormalizedTrajectoryWireSchema.parse(
     omitUndefinedProperties({
       schema_version: trajectory.schemaVersion,
@@ -329,7 +370,7 @@ export function toNormalizedTrajectoryWire(
   );
 }
 
-export function fromNormalizedTrajectoryWire(input: unknown): NormalizedTrajectory {
+export function fromNormalizedTrajectoryWire(input: unknown): TraceArtifact {
   const wire = NormalizedTrajectoryWireSchema.parse(input);
 
   return {
@@ -347,7 +388,15 @@ export function fromNormalizedTrajectoryWire(input: unknown): NormalizedTrajecto
   };
 }
 
-function toNormalizedTraceSourceWire(source: NormalizedTraceSource) {
+export function toTraceArtifactWire(artifact: TraceArtifact): TraceArtifactWire {
+  return toNormalizedTrajectoryWire(artifact);
+}
+
+export function fromTraceArtifactWire(input: unknown): TraceArtifact {
+  return fromNormalizedTrajectoryWire(input);
+}
+
+function toNormalizedTraceSourceWire(source: TraceSource) {
   return omitUndefinedProperties({
     kind: source.kind,
     path: source.path,
@@ -361,7 +410,7 @@ function toNormalizedTraceSourceWire(source: NormalizedTraceSource) {
 
 function fromNormalizedTraceSourceWire(
   source: z.infer<typeof NormalizedTraceSourceWireSchema>,
-): NormalizedTraceSource {
+): TraceSource {
   return {
     kind: source.kind,
     path: source.path,
@@ -373,7 +422,7 @@ function fromNormalizedTraceSourceWire(
   };
 }
 
-function toNormalizedTraceSessionWire(session: NormalizedTraceSession) {
+function toNormalizedTraceSessionWire(session: TraceSession) {
   return omitUndefinedProperties({
     session_id: session.sessionId,
     conversation_id: session.conversationId,
@@ -386,7 +435,7 @@ function toNormalizedTraceSessionWire(session: NormalizedTraceSession) {
 
 function fromNormalizedTraceSessionWire(
   session: z.infer<typeof NormalizedTraceSessionWireSchema>,
-): NormalizedTraceSession {
+): TraceSession {
   return {
     sessionId: session.session_id,
     conversationId: session.conversation_id,
@@ -397,7 +446,7 @@ function fromNormalizedTraceSessionWire(
   };
 }
 
-function toNormalizedTraceBranchWire(branch: NormalizedTraceBranch) {
+function toNormalizedTraceBranchWire(branch: TraceBranch) {
   return omitUndefinedProperties({
     selected_leaf_id: branch.selectedLeafId,
     selected_path_ids: branch.selectedPathIds,
@@ -409,7 +458,7 @@ function toNormalizedTraceBranchWire(branch: NormalizedTraceBranch) {
 
 function fromNormalizedTraceBranchWire(
   branch: z.infer<typeof NormalizedTraceBranchWireSchema>,
-): NormalizedTraceBranch {
+): TraceBranch {
   return {
     selectedLeafId: branch.selected_leaf_id,
     selectedPathIds: branch.selected_path_ids,
@@ -419,7 +468,7 @@ function fromNormalizedTraceBranchWire(
   };
 }
 
-function toNormalizedTraceEventWire(event: NormalizedTraceEvent): NormalizedTraceEventWire {
+function toNormalizedTraceEventWire(event: TraceEvent): NormalizedTraceEventWire {
   return NormalizedTraceEventWireSchema.parse(
     omitUndefinedProperties({
       event_id: event.eventId,
@@ -433,6 +482,7 @@ function toNormalizedTraceEventWire(event: NormalizedTraceEvent): NormalizedTrac
       message: event.message ? toNormalizedTraceMessageWire(event.message) : undefined,
       model: event.model ? toNormalizedTraceModelWire(event.model) : undefined,
       tool: event.tool ? toNormalizedTraceToolWire(event.tool) : undefined,
+      error: event.error ? toNormalizedTraceErrorWire(event.error) : undefined,
       source_ref: event.sourceRef ? toNormalizedTraceSourceRefWire(event.sourceRef) : undefined,
       raw_evidence: event.rawEvidence?.map(toNormalizedRawEvidenceWire),
       redaction: event.redaction,
@@ -441,7 +491,7 @@ function toNormalizedTraceEventWire(event: NormalizedTraceEvent): NormalizedTrac
   );
 }
 
-function fromNormalizedTraceEventWire(event: NormalizedTraceEventWire): NormalizedTraceEvent {
+function fromNormalizedTraceEventWire(event: NormalizedTraceEventWire): TraceEvent {
   return {
     eventId: event.event_id,
     parentEventId: event.parent_event_id,
@@ -454,6 +504,7 @@ function fromNormalizedTraceEventWire(event: NormalizedTraceEventWire): Normaliz
     message: event.message ? fromNormalizedTraceMessageWire(event.message) : undefined,
     model: event.model ? fromNormalizedTraceModelWire(event.model) : undefined,
     tool: event.tool ? fromNormalizedTraceToolWire(event.tool) : undefined,
+    error: event.error ? fromNormalizedTraceErrorWire(event.error) : undefined,
     sourceRef: event.source_ref ? fromNormalizedTraceSourceRefWire(event.source_ref) : undefined,
     rawEvidence: event.raw_evidence?.map(fromNormalizedRawEvidenceWire),
     redaction: event.redaction,
@@ -461,7 +512,7 @@ function fromNormalizedTraceEventWire(event: NormalizedTraceEventWire): Normaliz
   };
 }
 
-function toNormalizedTraceMessageWire(message: NormalizedTraceMessage) {
+function toNormalizedTraceMessageWire(message: TraceMessage) {
   return omitUndefinedProperties({
     role: message.role,
     name: message.name,
@@ -474,7 +525,7 @@ function toNormalizedTraceMessageWire(message: NormalizedTraceMessage) {
 
 function fromNormalizedTraceMessageWire(
   message: z.infer<typeof NormalizedTraceMessageWireSchema>,
-): NormalizedTraceMessage {
+): TraceMessage {
   return {
     role: message.role,
     name: message.name,
@@ -485,7 +536,7 @@ function fromNormalizedTraceMessageWire(
   };
 }
 
-function toNormalizedTraceModelWire(model: NormalizedTraceModel) {
+function toNormalizedTraceModelWire(model: TraceModel) {
   return omitUndefinedProperties({
     provider: model.provider,
     name: model.name,
@@ -497,7 +548,7 @@ function toNormalizedTraceModelWire(model: NormalizedTraceModel) {
 
 function fromNormalizedTraceModelWire(
   model: z.infer<typeof NormalizedTraceModelWireSchema>,
-): NormalizedTraceModel {
+): TraceModel {
   return {
     provider: model.provider,
     name: model.name,
@@ -507,7 +558,7 @@ function fromNormalizedTraceModelWire(
   };
 }
 
-function toNormalizedTraceToolWire(tool: NormalizedTraceTool) {
+function toNormalizedTraceToolWire(tool: TraceTool) {
   return omitUndefinedProperties({
     name: tool.name,
     call_id: tool.callId,
@@ -522,7 +573,7 @@ function toNormalizedTraceToolWire(tool: NormalizedTraceTool) {
 
 function fromNormalizedTraceToolWire(
   tool: z.infer<typeof NormalizedTraceToolWireSchema>,
-): NormalizedTraceTool {
+): TraceTool {
   return {
     name: tool.name,
     callId: tool.call_id,
@@ -535,7 +586,29 @@ function fromNormalizedTraceToolWire(
   };
 }
 
-function toNormalizedTraceSourceRefWire(sourceRef: NormalizedTraceSourceRef) {
+/**
+ * Serialize a `TraceError` for the wire shape. The message, name, code, stack,
+ * and metadata keys are copied under the same names, and keys whose value is
+ * undefined are dropped by `omitUndefinedProperties`.
+ */
+function toNormalizedTraceErrorWire(error: TraceError) {
+  const { message, name, code, stack, metadata } = error;
+  return omitUndefinedProperties({ message, name, code, stack, metadata });
+}
+
+/**
+ * Rebuild an in-memory `TraceError` from its parsed wire form.
+ *
+ * The message, name, code, stack, and metadata keys are copied under the same names.
+ */
+function fromNormalizedTraceErrorWire(
+  error: z.infer<typeof NormalizedTraceErrorWireSchema>,
+): TraceError {
+  const { message, name, code, stack, metadata } = error;
+  return { message, name, code, stack, metadata };
+}
+
+function toNormalizedTraceSourceRefWire(sourceRef: TraceSourceRef) {
   return omitUndefinedProperties({
     event_id: sourceRef.eventId,
     message_id: sourceRef.messageId,
@@ -550,7 +623,7 @@ function toNormalizedTraceSourceRefWire(sourceRef: NormalizedTraceSourceRef) {
 
 function fromNormalizedTraceSourceRefWire(
   sourceRef: z.infer<typeof NormalizedTraceSourceRefWireSchema>,
-): NormalizedTraceSourceRef {
+): TraceSourceRef {
   return {
     eventId: sourceRef.event_id,
     messageId: sourceRef.message_id,
@@ -563,7 +636,7 @@ function fromNormalizedTraceSourceRefWire(
   };
 }
 
-function toNormalizedRawEvidenceWire(evidence: NormalizedRawEvidence) {
+function toNormalizedRawEvidenceWire(evidence: TraceRawEvidence) {
   return omitUndefinedProperties({
     kind: evidence.kind,
     ref: evidence.ref,
@@ -576,7 +649,7 @@ function toNormalizedRawEvidenceWire(evidence: NormalizedRawEvidence) {
 
 function fromNormalizedRawEvidenceWire(
   evidence: z.infer<typeof NormalizedRawEvidenceWireSchema>,
-): NormalizedRawEvidence {
+): TraceRawEvidence {
   return {
     kind: evidence.kind,
     ref: evidence.ref,
@@ -587,6 +660,32 @@ function fromNormalizedRawEvidenceWire(
   };
 }
 
+// Deprecated compatibility names retained for callers that imported the older
+// normalized-trace terminology. New code should use the AgentV-owned Trace*
+// names above.
+/** @deprecated Use TraceSource. */
+export type NormalizedTraceSource = TraceSource;
+/** @deprecated Use TraceSession. */
+export type NormalizedTraceSession = TraceSession;
+/** @deprecated Use TraceBranch. */
+export type NormalizedTraceBranch = TraceBranch;
+/** @deprecated Use TraceSourceRef. */
+export type NormalizedTraceSourceRef = TraceSourceRef;
+/** @deprecated Use TraceRawEvidence. */
+export type NormalizedRawEvidence = TraceRawEvidence;
+/** @deprecated Use TraceRedactionState. */
+export type NormalizedRedactionState = TraceRedactionState;
+/** @deprecated Use TraceError. */
+export type NormalizedTraceError = TraceError;
+/** @deprecated Use TraceMessage. */
+export type NormalizedTraceMessage = TraceMessage;
+/** @deprecated Use TraceModel. */
+export type NormalizedTraceModel = TraceModel;
+/** @deprecated Use TraceTool. */
+export type NormalizedTraceTool = TraceTool;
+/** @deprecated Use TraceEvent. */
+export type NormalizedTraceEvent = TraceEvent;
+
 /**
  * Token usage metrics from provider execution.
  */
@@ -605,8 +704,8 @@ export interface TokenUsage {
  * Derived compact summary of a trace for lightweight persistence.
  *
  * This is a compatibility/read model for existing result artifacts and
- * aggregation. It is intentionally smaller than NormalizedTrajectory and should
- * not be treated as independently authored trace state when a full trajectory is
+ * aggregation. It is intentionally smaller than Trace and should
+ * not be treated as independently authored trace state when a full Trace is
  * available.
  */
 export interface TraceSummary {
@@ -622,6 +721,247 @@ export interface TraceSummary {
   readonly llmCallCount?: number;
 }
 
+/**
+ * Canonical trace attached to every evaluation result.
+ *
+ * The compact TraceSummary fields are mirrored for existing
+ * metric graders; `messages` and `events` are the complete canonical
+ * execution record. Result `output` is only the final answer; tools,
+ * intermediate assistant text, timing, usage, provider provenance, and replay
+ * metadata live here.
+ */
+export interface Trace extends TraceSummary {
+  readonly schemaVersion: typeof TRACE_SCHEMA_VERSION;
+  /** Complete normalized chat transcript used for transcript-aware graders. */
+  readonly messages: readonly Message[];
+  /** Structured event stream derived from the same messages and metrics. */
+  readonly events: readonly TraceEvent[];
+  readonly tokenUsage?: TokenUsage;
+  readonly costUsd?: number;
+  readonly durationMs?: number;
+  readonly startTime?: string;
+  readonly endTime?: string;
+  /** Provider/session/eval provenance. Provider-native IDs use metadata keys. */
+  readonly metadata?: Readonly<Record<string, unknown>>;
+}
+
+interface BuildTraceOptions {
+  readonly input?: readonly Message[];
+  readonly output?: readonly Message[];
+  readonly summary?: TraceSummary;
+  readonly finalOutput?: string;
+  readonly tokenUsage?: TokenUsage;
+  readonly costUsd?: number;
+  readonly durationMs?: number;
+  readonly startTime?: string;
+  readonly endTime?: string;
+  readonly provider?: string;
+  readonly target?: string;
+  readonly testId?: string;
+  readonly conversationId?: string;
+  readonly metadata?: Readonly<Record<string, unknown>>;
+  readonly error?: TraceError | string;
+}
+
+function sameMessageContent(first: Message | undefined, second: Message | undefined): boolean {
+  // Two messages match only when both exist and agree on role and content.
+  if (!first || !second) return false;
+  if (first.role !== second.role) return false;
+  return JSON.stringify(first.content) === JSON.stringify(second.content);
+}
+
+function buildTraceMessages(
+  input: readonly Message[] | undefined,
+  output: readonly Message[] | undefined,
+): readonly Message[] {
+  const prompt = input ?? [];
+  const produced = output ?? [];
+
+  // Nothing came back from the provider: the prompt is the whole transcript.
+  if (produced.length === 0) return prompt;
+
+  // A user or system turn in the provider output means it already returned a
+  // conversation-shaped transcript (typical for agent/transcript providers),
+  // so prepending the prompt would duplicate it. Single-shot LLM providers
+  // usually return only the assistant reply and fall through.
+  for (const message of produced) {
+    if (message.role === 'user' || message.role === 'system') return produced;
+  }
+
+  // A lone prompt message that the output repeats as its first entry is
+  // likewise already present, so the output is used unchanged. (`produced` is
+  // known to be non-empty at this point.)
+  const promptEchoed = prompt.length === 1 && sameMessageContent(prompt[0], produced[0]);
+  if (promptEchoed) {
+    return produced;
+  }
+
+  // Otherwise the transcript is the prompt messages followed by the provider
+  // output messages, in that order.
+  return [...prompt, ...produced];
+}
+
+/**
+ * Project a provider `Message` onto the `TraceMessage` shape. Only role, name,
+ * content, tokenUsage, and metadata are carried over; any other properties of
+ * the source message are not copied by this helper.
+ */
+function toTraceMessage(message: Message): TraceMessage {
+  const { role, name, content, tokenUsage, metadata } = message;
+  return { role, name, content, tokenUsage, metadata };
+}
+
+function toTraceError(error: TraceError | string): TraceError {
+  return typeof error !== 'string' ? error : { message: error };
+}
+
+/**
+ * Build the canonical trace for an evaluation case from provider messages and
+ * execution metrics. This is the single projection used by result JSONL,
+ * code-grader stdin, `outputs/answer.md`, and `outputs/transcript.jsonl`.
+ */
+export function buildTraceFromMessages(options: BuildTraceOptions = {}): Trace {
+  const messages = buildTraceMessages(options.input, options.output);
+  const computed = computeTraceSummary(messages);
+  const summary = options.summary ?? computed.trace;
+  const events: TraceEvent[] = [];
+  let ordinal = 0;
+
+  for (const [messageIndex, message] of messages.entries()) {
+    const eventId = `message-${messageIndex}`;
+    events.push({
+      eventId,
+      ordinal: ordinal++,
+      type: 'message',
+      timestamp: message.startTime,
+      durationMs: message.durationMs,
+      message: toTraceMessage(message),
+      metadata: { message_index: messageIndex },
+    });
+
+    for (const [toolIndex, toolCall] of (message.toolCalls ?? []).entries()) {
+      const toolEventId = `message-${messageIndex}-tool-${toolIndex}`;
+      events.push({
+        eventId: toolEventId,
+        parentEventId: eventId,
+        ordinal: ordinal++,
+        type: 'tool_call',
+        timestamp: toolCall.startTime,
+        durationMs: toolCall.durationMs,
+        tool: {
+          name: toolCall.tool,
+          callId: toolCall.id,
+          input: toolCall.input,
+          output: toolCall.output,
+          status: 'ok',
+        },
+        metadata: {
+          message_index: messageIndex,
+          tool_index: toolIndex,
+        },
+      });
+
+      if (toolCall.output !== undefined) {
+        events.push({
+          eventId: `${toolEventId}-result`,
+          parentEventId: toolEventId,
+          ordinal: ordinal++,
+          type: 'tool_result',
+          timestamp: toolCall.endTime,
+          tool: {
+            name: toolCall.tool,
+            callId: toolCall.id,
+            output: toolCall.output,
+            status: 'ok',
+          },
+          metadata: {
+            message_index: messageIndex,
+            tool_index: toolIndex,
+          },
+        });
+      }
+    }
+  }
+
+  // The final response is anchored to the last assistant message. A role
+  // projection plus lastIndexOf yields its index (-1 when absent) without
+  // copying, wrapping, and reversing the whole transcript.
+  const finalAssistantIndex = messages.map((message) => message.role).lastIndexOf('assistant');
+  if (finalAssistantIndex !== -1) {
+    const finalMessage = messages[finalAssistantIndex];
+    events.push({
+      eventId: 'final-response',
+      parentEventId: `message-${finalAssistantIndex}`,
+      ordinal: ordinal++,
+      type: 'final_response',
+      timestamp: finalMessage.endTime ?? finalMessage.startTime ?? options.endTime,
+      message: {
+        ...toTraceMessage(finalMessage),
+        content: options.finalOutput ?? finalMessage.content,
+      },
+      metadata: { message_index: finalAssistantIndex },
+    });
+  }
+
+  if (options.error) {
+    events.push({
+      eventId: 'error',
+      ordinal: ordinal++,
+      type: 'error',
+      timestamp: options.endTime,
+      error: toTraceError(options.error),
+    });
+  }
+
+  return {
+    schemaVersion: TRACE_SCHEMA_VERSION,
+    eventCount: summary.eventCount,
+    toolCalls: summary.toolCalls,
+    errorCount: summary.errorCount + (options.error ? 1 : 0),
+    llmCallCount: summary.llmCallCount,
+    ...(summary.toolDurations ? { toolDurations: summary.toolDurations } : {}),
+    messages,
+    events,
+    tokenUsage: options.tokenUsage,
+    costUsd: options.costUsd,
+    durationMs: options.durationMs,
+    startTime: options.startTime ?? computed.startTime,
+    endTime: options.endTime ?? computed.endTime,
+    metadata: {
+      ...(options.provider ? { provider: options.provider } : {}),
+      ...(options.target ? { target: options.target } : {}),
+      ...(options.testId ? { eval_case_id: options.testId } : {}),
+      ...(options.conversationId ? { provider_session_id: options.conversationId } : {}),
+      ...options.metadata,
+    },
+  };
+}
+
+/**
+ * Produce a new trace that equals `trace` plus one trailing `error` event.
+ */
+export function appendErrorEventToTrace(
+  trace: Trace,
+  error: TraceError | string,
+  metadata?: Readonly<Record<string, unknown>>,
+): Trace {
+  // The current event count supplies both the id suffix and the ordinal.
+  const position = trace.events.length;
+  const errorEvent: TraceEvent = {
+    eventId: `error-${position}`,
+    ordinal: position,
+    type: 'error',
+    timestamp: trace.endTime,
+    error: toTraceError(error),
+    metadata,
+  };
+  return {
+    ...trace,
+    errorCount: trace.errorCount + 1,
+    events: [...trace.events, errorEvent],
+  };
+}
+
 /**
  * Combined result of trace computation + execution metrics merge.
  * Returned by computeTraceSummaryWithMetrics().
@@ -793,9 +1133,7 @@ export function computeTraceSummary(messages: readonly MessageLike[]): TraceComp
  * source also carries explicit `branch.includedEventIds`, honor it here so
  * branchable transcripts cannot accidentally grade omitted alternatives.
  */
-export function getSelectedTrajectoryEvents(
-  trajectory: NormalizedTrajectory,
-): readonly NormalizedTraceEvent[] {
+export function getSelectedTrajectoryEvents(trajectory: TraceArtifact): readonly TraceEvent[] {
   if (!trajectory.branch?.includedEventIds || trajectory.branch.includedEventIds.length === 0) {
     return trajectory.events;
   }
@@ -809,16 +1147,14 @@ export function getSelectedTrajectoryEvents(
  *
  * This is the canonical bridge from the high-fidelity trajectory contract to the
  * backward-compatible summary/read model. Keep the projection one-way: importers
- * and replay should preserve NormalizedTrajectory, while existing result readers
+ * and replay should preserve TraceArtifact or Trace, while existing result readers
  * can continue consuming the derived TraceSummary shape unchanged.
  *
  * The summary keeps the current lightweight contract: eventCount is the number
  * of tool-call events, toolCalls is counted by tool name, toolDurations carries
  * per-tool milliseconds when present, and llmCallCount counts model turns.
  */
-export function computeTraceSummaryFromTrajectory(
-  trajectory: NormalizedTrajectory,
-): TraceComputeResult {
+export function computeTraceSummaryFromTrajectory(trajectory: TraceArtifact): TraceComputeResult {
   const selectedEvents = getSelectedTrajectoryEvents(trajectory);
   const hasModelTurnEvents = selectedEvents.some((event) => event.type === 'model_turn');
   const toolCallCounts: Record<string, number> = {};
@@ -896,7 +1232,7 @@ function deriveEventEnd(start: Date | undefined, durationMs: number | undefined)
   return new Date(start.getTime() + durationMs);
 }
 
-function isErrorToolEvent(event: NormalizedTraceEvent): boolean {
+function isErrorToolEvent(event: TraceEvent): boolean {
   return Boolean(
     event.tool?.error ||
       event.tool?.status === 'error' ||
diff --git a/packages/core/src/import/index.ts b/packages/core/src/import/index.ts
index 4170f4128..5f585e9a9 100644
--- a/packages/core/src/import/index.ts
+++ b/packages/core/src/import/index.ts
@@ -15,6 +15,8 @@ export {
   groupTranscriptJsonLines,
   readTranscriptFile,
   readTranscriptJsonl,
+  traceFromTranscriptJsonLines,
+  traceToTranscriptJsonLines,
   toTranscriptJsonLines,
   type TranscriptEntry,
   type TranscriptJsonLine,
diff --git a/packages/core/src/import/types.ts b/packages/core/src/import/types.ts
index 4a69f5946..2a451c24f 100644
--- a/packages/core/src/import/types.ts
+++ b/packages/core/src/import/types.ts
@@ -18,6 +18,7 @@ import { readFile } from 'node:fs/promises';
 
 import { toCamelCaseDeep, toSnakeCaseDeep } from '../evaluation/case-conversion.js';
 import type { Message, ProviderTokenUsage } from '../evaluation/providers/types.js';
+import { buildTraceFromMessages, type Trace } from '../evaluation/trace.js';
 
 /**
  * A parsed transcript: ordered messages plus session metadata (internal camelCase).
@@ -148,6 +149,64 @@ export function toTranscriptJsonLines(
   }));
 }
 
+/**
+ * Convert a canonical evaluation trace to transcript JSONL rows.
+ */
+export function traceToTranscriptJsonLines(
+  trace: Trace,
+  options?: { testId?: string; target?: string },
+): TranscriptJsonLine[] {
+  const provider =
+    (typeof trace.metadata?.provider === 'string' ? trace.metadata.provider : undefined) ??
+    options?.target ??
+    'agentv';
+  const sessionId =
+    (typeof trace.metadata?.provider_session_id === 'string'
+      ? trace.metadata.provider_session_id
+      : undefined) ??
+    (typeof trace.metadata?.eval_case_id === 'string' ? trace.metadata.eval_case_id : undefined) ??
+    options?.testId ??
+    'trace';
+
+  return toTranscriptJsonLines(
+    {
+      messages: [...trace.messages],
+      source: {
+        provider,
+        sessionId,
+        startedAt: trace.startTime,
+      },
+      tokenUsage: trace.tokenUsage,
+      durationMs: trace.durationMs,
+      costUsd: trace.costUsd,
+    },
+    options,
+  );
+}
+
+/**
+ * Reconstruct a canonical trace/messages representation from the first grouped entry of
+ * transcript JSONL rows. Transcript-aware graders can use this for offline replay parity.
+ */
+export function traceFromTranscriptJsonLines(lines: readonly TranscriptJsonLine[]): Trace {
+  const [entry] = groupTranscriptJsonLines(lines);
+  if (!entry) {
+    return buildTraceFromMessages();
+  }
+
+  return buildTraceFromMessages({
+    output: entry.messages,
+    tokenUsage: entry.tokenUsage,
+    durationMs: entry.durationMs,
+    costUsd: entry.costUsd ?? undefined,
+    startTime: entry.source.startedAt,
+    provider: entry.source.provider,
+    target: entry.target,
+    testId: entry.testId,
+    conversationId: entry.source.sessionId,
+  });
+}
+
 function buildReplayMessage(line: TranscriptJsonLine): Message {
   const camelCased = toCamelCaseDeep(line) as {
     role: string;
diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts
index 5ee52a54c..f0abe2c44 100644
--- a/packages/eval/src/index.ts
+++ b/packages/eval/src/index.ts
@@ -9,10 +9,9 @@
  * import { defineAssertion } from '@agentv/eval';
  *
  * export default defineAssertion(({ output, criteria }) => {
- *   const text = output?.map(m => String(m.content ?? '')).join(' ') ?? '';
  *   return {
- *     pass: text.includes('hello'),
- *     assertions: [{ text: 'Checks greeting', passed: text.includes('hello') }],
+ *     pass: output.includes('hello'),
+ *     assertions: [{ text: 'Checks greeting', passed: output.includes('hello') }],
  *   };
  * }));
  * ```
@@ -23,7 +22,6 @@
  * import { defineCodeGrader } from '@agentv/eval';
  *
  * export default defineCodeGrader(({ trace, output }) => {
- *   const text = output?.map(m => String(m.content ?? '')).join(' ') ?? '';
  *   return {
  *     score: trace?.eventCount <= 5 ? 1.0 : 0.5,
  *     assertions: [{ text: 'Efficient tool usage', passed: trace?.eventCount <= 5 }],
@@ -43,7 +41,25 @@ export {
   NORMALIZED_TRACE_EVENT_TYPES,
   NORMALIZED_TRACE_SOURCE_KINDS,
   NORMALIZED_TRAJECTORY_SCHEMA_VERSION,
+  TRACE_REDACTION_LEVELS,
+  TRACE_SCHEMA_VERSION,
+  TRACE_SOURCE_KINDS,
+  TRACE_EVENT_TYPES,
+  TRACE_TOOL_STATUSES,
   TraceSummarySchema,
+  TraceSchema,
+  TraceArtifactSchema,
+  TraceRawEvidenceSchema,
+  TraceRedactionStateSchema,
+  TraceBranchSchema,
+  TraceErrorSchema,
+  TraceEventSchema,
+  TraceMessageSchema,
+  TraceModelSchema,
+  TraceSessionSchema,
+  TraceSourceRefSchema,
+  TraceSourceSchema,
+  TraceToolSchema,
   NormalizedRawEvidenceSchema,
   NormalizedRedactionStateSchema,
   NormalizedTraceBranchSchema,
@@ -66,6 +82,18 @@ export {
   ContentSchema,
   type CodeGraderInput,
   type CodeGraderResult,
+  type TraceArtifact,
+  type TraceRawEvidence,
+  type TraceRedactionState,
+  type TraceBranch,
+  type TraceError,
+  type TraceEvent,
+  type TraceMessage,
+  type TraceModel,
+  type TraceSession,
+  type TraceSource,
+  type TraceSourceRef,
+  type TraceTool,
   type NormalizedRawEvidence,
   type NormalizedRedactionState,
   type NormalizedTraceBranch,
@@ -79,6 +107,7 @@ export {
   type NormalizedTraceTool,
   type NormalizedTrajectory,
   type TraceSummary,
+  type Trace,
   type Message,
   type ToolCall,
   type TokenUsage,
diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts
index ea454ba67..3200cdc38 100644
--- a/packages/eval/src/schemas.ts
+++ b/packages/eval/src/schemas.ts
@@ -35,9 +35,8 @@ export const TokenUsageSchema = z.object({
 /**
  * Derived trace summary schema (camelCase for TypeScript ergonomics).
  *
- * This is a compatibility/read model for existing code graders and result
- * artifacts. Full trace state should use NormalizedTrajectory and project into
- * this shape only at result or grader-compatibility boundaries.
+ * This is a compact read model for metric-style graders. Full transcript/tool
+ * evidence lives in the canonical `Trace` under `messages` and `events`.
  */
 export const TraceSummarySchema = z.object({
   eventCount: z.number(),
@@ -64,12 +63,20 @@ export const NORMALIZED_TRACE_EVENT_TYPES = [
   'model_turn',
   'tool_call',
   'tool_result',
+  'final_response',
+  'error',
 ] as const;
 
 export const NORMALIZED_TOOL_STATUSES = ['ok', 'error', 'timeout', 'cancelled', 'unknown'] as const;
 
 export const NORMALIZED_REDACTION_LEVELS = ['none', 'partial', 'full'] as const;
 
+export const TRACE_SCHEMA_VERSION = NORMALIZED_TRAJECTORY_SCHEMA_VERSION;
+export const TRACE_SOURCE_KINDS = NORMALIZED_TRACE_SOURCE_KINDS;
+export const TRACE_EVENT_TYPES = NORMALIZED_TRACE_EVENT_TYPES;
+export const TRACE_TOOL_STATUSES = NORMALIZED_TOOL_STATUSES;
+export const TRACE_REDACTION_LEVELS = NORMALIZED_REDACTION_LEVELS;
+
 const MetadataSchema = z.record(z.string(), z.unknown());
 
 export const NormalizedRedactionStateSchema = z.object({
@@ -173,6 +180,7 @@ export const NormalizedTraceEventSchema = z.object({
   message: NormalizedTraceMessageSchema.optional(),
   model: NormalizedTraceModelSchema.optional(),
   tool: NormalizedTraceToolSchema.optional(),
+  error: NormalizedTraceErrorSchema.optional(),
   sourceRef: NormalizedTraceSourceRefSchema.optional(),
   rawEvidence: z.array(NormalizedRawEvidenceSchema).optional(),
   redaction: NormalizedRedactionStateSchema.optional(),
@@ -200,6 +208,19 @@ export const NormalizedTrajectorySchema = z.object({
   metadata: MetadataSchema.optional(),
 });
 
+export const TraceRedactionStateSchema = NormalizedRedactionStateSchema;
+export const TraceErrorSchema = NormalizedTraceErrorSchema;
+export const TraceSourceSchema = NormalizedTraceSourceSchema;
+export const TraceSessionSchema = NormalizedTraceSessionSchema;
+export const TraceBranchSchema = NormalizedTraceBranchSchema;
+export const TraceSourceRefSchema = NormalizedTraceSourceRefSchema;
+export const TraceRawEvidenceSchema = NormalizedRawEvidenceSchema;
+export const TraceMessageSchema = NormalizedTraceMessageSchema;
+export const TraceModelSchema = NormalizedTraceModelSchema;
+export const TraceToolSchema = NormalizedTraceToolSchema;
+export const TraceEventSchema = NormalizedTraceEventSchema;
+export const TraceArtifactSchema = NormalizedTrajectorySchema;
+
 /**
  * Tool call schema.
  */
@@ -269,22 +290,45 @@ export const MessageSchema = z.object({
   metadata: z.record(z.unknown()).optional(),
 });
 
+/**
+ * Canonical evaluation trace exposed to custom graders.
+ *
+ * Top-level summary fields (`eventCount`, `toolCalls`, `errorCount`) remain
+ * available for existing metric graders; full transcript/tool evidence is under
+ * `messages` and structured execution events under `events`.
+ */
+export const TraceSchema = TraceSummarySchema.extend({
+  schemaVersion: z.literal(TRACE_SCHEMA_VERSION),
+  messages: z.array(MessageSchema),
+  events: z.array(TraceEventSchema),
+  tokenUsage: TokenUsageSchema.optional(),
+  costUsd: z.number().optional(),
+  durationMs: z.number().optional(),
+  startTime: z.string().optional(),
+  endTime: z.string().optional(),
+  metadata: MetadataSchema.optional(),
+});
+
 /**
  * Code grader input schema (camelCase, converted from snake_case wire format).
  *
- * Structured fields (`input`, `output`, `expectedOutput`) are always `Message[]`.
- * To extract plain text from message content, use `getTextContent()` from `@agentv/core`.
+ * `output` is the final answer/scored result only. Transcript-aware graders
+ * should inspect `messages`, `trace.messages`, or `trace.events`.
  */
 export const CodeGraderInputSchema = z.object({
   criteria: z.string(),
   expectedOutput: z.array(MessageSchema),
-  output: z.array(MessageSchema).nullable().optional(),
+  output: z.string().nullable().optional(),
+  /** Deprecated migration alias; same value as output for text agents. */
+  answer: z.string().optional(),
+  messages: z.array(MessageSchema).optional().default([]),
   /** Path to a temp file containing the output JSON (used for large payloads). */
   outputPath: z.string().optional(),
   inputFiles: z.array(z.string()),
   input: z.array(MessageSchema),
   metadata: z.record(z.unknown()).nullable().optional(),
-  trace: TraceSummarySchema.nullable().optional(),
+  trace: TraceSchema.nullable().optional(),
+  traceSummary: TraceSummarySchema.nullable().optional(),
   tokenUsage: TokenUsageSchema.nullable().optional(),
   costUsd: z.number().nullable().optional(),
   durationMs: z.number().nullable().optional(),
@@ -321,18 +365,43 @@ export type CodeGraderInput = z.infer<typeof CodeGraderInputSchema>;
 export type CodeGraderResult = z.infer<typeof CodeGraderResultSchema>;
 
 export type TraceSummary = z.infer<typeof TraceSummarySchema>;
-export type NormalizedTrajectory = z.infer<typeof NormalizedTrajectorySchema>;
-export type NormalizedTraceSource = z.infer<typeof NormalizedTraceSourceSchema>;
-export type NormalizedTraceSession = z.infer<typeof NormalizedTraceSessionSchema>;
-export type NormalizedTraceBranch = z.infer<typeof NormalizedTraceBranchSchema>;
-export type NormalizedTraceEvent = z.infer<typeof NormalizedTraceEventSchema>;
-export type NormalizedTraceMessage = z.infer<typeof NormalizedTraceMessageSchema>;
-export type NormalizedTraceModel = z.infer<typeof NormalizedTraceModelSchema>;
-export type NormalizedTraceTool = z.infer<typeof NormalizedTraceToolSchema>;
-export type NormalizedTraceError = z.infer<typeof NormalizedTraceErrorSchema>;
-export type NormalizedTraceSourceRef = z.infer<typeof NormalizedTraceSourceRefSchema>;
-export type NormalizedRawEvidence = z.infer<typeof NormalizedRawEvidenceSchema>;
-export type NormalizedRedactionState = z.infer<typeof NormalizedRedactionStateSchema>;
+export type Trace = z.infer<typeof TraceSchema>;
+export type TraceArtifact = z.infer<typeof TraceArtifactSchema>;
+export type TraceSource = z.infer<typeof TraceSourceSchema>;
+export type TraceSession = z.infer<typeof TraceSessionSchema>;
+export type TraceBranch = z.infer<typeof TraceBranchSchema>;
+export type TraceEvent = z.infer<typeof TraceEventSchema>;
+export type TraceMessage = z.infer<typeof TraceMessageSchema>;
+export type TraceModel = z.infer<typeof TraceModelSchema>;
+export type TraceTool = z.infer<typeof TraceToolSchema>;
+export type TraceError = z.infer<typeof TraceErrorSchema>;
+export type TraceSourceRef = z.infer<typeof TraceSourceRefSchema>;
+export type TraceRawEvidence = z.infer<typeof TraceRawEvidenceSchema>;
+export type TraceRedactionState = z.infer<typeof TraceRedactionStateSchema>;
+/** @deprecated Use TraceArtifact for legacy import/replay artifacts or Trace for evaluation results. */
+export type NormalizedTrajectory = TraceArtifact;
+/** @deprecated Use TraceSource. */
+export type NormalizedTraceSource = TraceSource;
+/** @deprecated Use TraceSession. */
+export type NormalizedTraceSession = TraceSession;
+/** @deprecated Use TraceBranch. */
+export type NormalizedTraceBranch = TraceBranch;
+/** @deprecated Use TraceEvent. */
+export type NormalizedTraceEvent = TraceEvent;
+/** @deprecated Use TraceMessage. */
+export type NormalizedTraceMessage = TraceMessage;
+/** @deprecated Use TraceModel. */
+export type NormalizedTraceModel = TraceModel;
+/** @deprecated Use TraceTool. */
+export type NormalizedTraceTool = TraceTool;
+/** @deprecated Use TraceError. */
+export type NormalizedTraceError = TraceError;
+/** @deprecated Use TraceSourceRef. */
+export type NormalizedTraceSourceRef = TraceSourceRef;
+/** @deprecated Use TraceRawEvidence. */
+export type NormalizedRawEvidence = TraceRawEvidence;
+/** @deprecated Use TraceRedactionState. */
+export type NormalizedRedactionState = TraceRedactionState;
 export type Message = z.infer<typeof MessageSchema>;
 export type ToolCall = z.infer<typeof ToolCallSchema>;
 export type TokenUsage = z.infer<typeof TokenUsageSchema>;

From 31e25b0ad6d8f3e5b79b3d5680d00b3283ba1d68 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 12 Jun 2026 07:25:03 +0200
Subject: [PATCH 2/7] feat(evaluation): score final output with full trace

---
 apps/cli/src/commands/eval/commands/assert.ts |  17 ++-
 apps/cli/src/commands/eval/run-eval.ts        |  50 +++----
 apps/cli/src/commands/inspect/score.ts        |  23 ++-
 apps/cli/src/commands/inspect/show.ts         |   4 +-
 apps/cli/src/commands/inspect/utils.ts        |   3 +-
 apps/cli/src/commands/pipeline/grade.ts       |  14 +-
 apps/cli/src/commands/results/show.ts         |   4 +-
 .../src/evaluation/graders/code-grader.ts     |  34 ++++-
 packages/core/src/evaluation/graders/types.ts |   6 +-
 packages/core/src/evaluation/orchestrator.ts  | 131 +++++++++++++++---
 packages/core/src/evaluation/types.ts         |  10 +-
 .../core/src/observability/otel-exporter.ts   |  20 ++-
 12 files changed, 232 insertions(+), 84 deletions(-)

diff --git a/apps/cli/src/commands/eval/commands/assert.ts b/apps/cli/src/commands/eval/commands/assert.ts
index 519fbfc84..c50e20e5c 100644
--- a/apps/cli/src/commands/eval/commands/assert.ts
+++ b/apps/cli/src/commands/eval/commands/assert.ts
@@ -3,7 +3,7 @@ import path from 'node:path';
 import { command, option, optional, positional, string } from 'cmd-ts';
 import fg from 'fast-glob';
 
-import { executeScript } from '@agentv/core';
+import { buildTraceFromMessages, executeScript } from '@agentv/core';
 
 export const evalAssertCommand = command({
   name: 'assert',
@@ -64,17 +64,26 @@ export const evalAssertCommand = command({
 
     // Build payload matching CodeGrader's expected format (snake_case).
     // Include all fields that defineCodeGrader validates as required.
+    const messages = [{ role: 'assistant' as const, content: resolvedOutput }];
+    const inputMessages = [{ role: 'user' as const, content: resolvedInput }];
+    const trace = buildTraceFromMessages({
+      input: inputMessages,
+      output: messages,
+      finalOutput: resolvedOutput,
+    });
     const payload = JSON.stringify(
       {
-        output: [{ role: 'assistant', content: resolvedOutput }],
-        input: [{ role: 'user', content: resolvedInput }],
+        output: resolvedOutput,
+        answer: resolvedOutput,
+        messages,
+        input: inputMessages,
         question: resolvedInput,
         criteria: '',
         expected_output: [],
         reference_answer: '',
 
         input_files: [],
-        trace: null,
+        trace,
         token_usage: null,
         cost_usd: null,
         duration_ms: null,
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index a1d31c6bb..623045d7c 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -15,6 +15,7 @@ import {
   ResponseCache,
   RunBudgetTracker,
   type TrialsConfig,
+  buildTraceFromMessages,
   runEvaluation as defaultRunEvaluation,
   deriveCategory,
   ensureVSCodeSubagents,
@@ -313,32 +314,15 @@ function normalizeOutputMessages(cliValue: string | undefined): number | 'all' {
 }
 
 /**
- * Trim output messages for results JSONL.
- * Each message is stripped to { role, content } only.
- *
- * - `1` (default): last assistant message only (legacy behavior)
- * - `N`: last N messages (any role)
- * - `'all'`: all messages
+ * Deprecated compatibility hook for the old output-as-messages JSONL surface.
+ * Result `output` is now the final answer string; full transcript data stays
+ * under `trace.messages` and is intentionally not trimmed here.
  */
 export function trimOutputMessages(
   output: EvaluationResult['output'],
-  outputMessages: number | 'all',
+  _outputMessages: number | 'all',
 ): EvaluationResult['output'] {
-  const messages = output ?? [];
-
-  if (outputMessages === 'all') {
-    return messages.map((m) => ({ role: m.role, content: m.content }));
-  }
-
-  if (outputMessages === 1) {
-    // Legacy behavior: last assistant message only
-    const lastAssistant = messages.filter((m) => m.role === 'assistant').at(-1);
-    return lastAssistant ? [{ role: lastAssistant.role, content: lastAssistant.content }] : [];
-  }
-
-  // Last N messages (any role), trimmed to { role, content }
-  const sliced = messages.slice(-outputMessages);
-  return sliced.map((m) => ({ role: m.role, content: m.content }));
+  return output;
 }
 
 function normalizeOptions(
@@ -1576,7 +1560,16 @@ export async function runEvalCommand(
             testId: testCase.id,
             score: 0,
             assertions: [],
-            output: [],
+            output: budgetMsg,
+            trace: buildTraceFromMessages({
+              input: testCase.input as EvaluationResult['input'],
+              output: [{ role: 'assistant' as const, content: budgetMsg }],
+              finalOutput: budgetMsg,
+              target: selection.targetName,
+              testId: testCase.id,
+              conversationId: testCase.conversation_id,
+              error: budgetMsg,
+            }),
             error: budgetMsg,
             budgetExceeded: true,
             executionStatus: 'execution_error' as const,
@@ -1680,7 +1673,16 @@ export async function runEvalCommand(
                   testId: testCase.id,
                   score: 0,
                   assertions: [],
-                  output: [],
+                  output: message,
+                  trace: buildTraceFromMessages({
+                    input: testCase.input as EvaluationResult['input'],
+                    output: [{ role: 'assistant' as const, content: message }],
+                    finalOutput: message,
+                    target: selection.targetName,
+                    testId: testCase.id,
+                    conversationId: testCase.conversation_id,
+                    error: message,
+                  }),
                   scores: [],
                   error: message,
                   executionStatus: 'execution_error' as const,
diff --git a/apps/cli/src/commands/inspect/score.ts b/apps/cli/src/commands/inspect/score.ts
index 3abdc9ca8..75244e827 100644
--- a/apps/cli/src/commands/inspect/score.ts
+++ b/apps/cli/src/commands/inspect/score.ts
@@ -9,6 +9,7 @@ import {
   type Provider,
   type ProviderRequest,
   type ProviderResponse,
+  buildTraceFromMessages,
   createBuiltinRegistry,
   toCamelCaseDeep,
 } from '@agentv/core';
@@ -205,9 +206,25 @@ async function runScore(
   for (const raw of results) {
     if (testIdFilter && raw.test_id !== testIdFilter) continue;
 
-    const trace = toTraceSummary(raw);
     const candidate = extractCandidate(raw);
-    const output = raw.output as readonly Message[] | undefined;
+    const output =
+      (raw.trace as { messages?: unknown } | undefined)?.messages ??
+      (Array.isArray(raw.output) ? raw.output : undefined);
+    const outputMessages = Array.isArray(output)
+      ? (toCamelCaseDeep(output) as readonly Message[])
+      : undefined;
+    const trace =
+      raw.trace &&
+      Array.isArray((raw.trace as { messages?: unknown }).messages) &&
+      Array.isArray((raw.trace as { events?: unknown }).events)
+        ? (toCamelCaseDeep(raw.trace) as EvaluationContext['trace'])
+        : buildTraceFromMessages({
+            output: outputMessages,
+            finalOutput: candidate,
+            summary: toTraceSummary(raw),
+            target: raw.target,
+            testId: raw.test_id,
+          });
 
     const evalContext: EvaluationContext = {
       evalCase: buildTestCase(raw),
@@ -217,7 +234,7 @@ async function runScore(
       attempt: 1,
       promptInputs: { question: '' },
       now: new Date(),
-      output: Array.isArray(output) ? output : undefined,
+      output: outputMessages,
       trace,
       tokenUsage: raw.token_usage
         ? (toCamelCaseDeep(raw.token_usage) as EvaluationContext['tokenUsage'])
diff --git a/apps/cli/src/commands/inspect/show.ts b/apps/cli/src/commands/inspect/show.ts
index c738a4aad..3925ee172 100644
--- a/apps/cli/src/commands/inspect/show.ts
+++ b/apps/cli/src/commands/inspect/show.ts
@@ -82,7 +82,9 @@ interface RawToolCall {
  * Shows a hierarchical trace: LLM calls → tool calls.
  */
 function renderTree(result: RawResult): string {
-  const messages = result.output as RawMessage[] | undefined;
+  const messages =
+    (result.trace as { messages?: RawMessage[] } | undefined)?.messages ??
+    (Array.isArray(result.output) ? (result.output as RawMessage[]) : undefined);
   const spans = getTraceSpans(result);
 
   if (!messages || messages.length === 0) {
diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts
index 1b9cbd30b..25399ed20 100644
--- a/apps/cli/src/commands/inspect/utils.ts
+++ b/apps/cli/src/commands/inspect/utils.ts
@@ -1,7 +1,7 @@
 import { readFileSync, readdirSync, statSync } from 'node:fs';
 import path from 'node:path';
 import type { EvaluationResult, TraceSummary } from '@agentv/core';
-import { DEFAULT_THRESHOLD, toCamelCaseDeep } from '@agentv/core';
+import { DEFAULT_THRESHOLD, toCamelCaseDeep, toSnakeCaseDeep } from '@agentv/core';
 import {
   RESULT_INDEX_FILENAME,
   RESULT_RUNS_DIRNAME,
@@ -183,6 +183,7 @@ function toRawResult(result: EvaluationResult): RawResult {
     end_time: result.endTime,
     input: result.input,
     output: result.output,
+    trace: toSnakeCaseDeep(result.trace) as RawTraceSummary,
     file_changes: result.fileChanges,
   };
 }
diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts
index d262c0aa0..906987245 100644
--- a/apps/cli/src/commands/pipeline/grade.ts
+++ b/apps/cli/src/commands/pipeline/grade.ts
@@ -17,6 +17,7 @@ import { join } from 'node:path';
 
 import {
   type AssertionResult,
+  buildTraceFromMessages,
   executeScript,
   runContainsAllAssertion,
   runContainsAnyAssertion,
@@ -107,13 +108,22 @@ export async function runCodeGraders(
     const { testId, resultsDir, responseText, inputData } = task;
     const graderName = graderConfig.name as string;
     const inputText = extractInputText(inputData.input);
+    const messages = [{ role: 'assistant' as const, content: responseText }];
+    const trace = buildTraceFromMessages({
+      input: inputData.input,
+      output: messages,
+      finalOutput: responseText,
+      testId,
+    });
     const payload = JSON.stringify({
-      output: [{ role: 'assistant', content: responseText }],
+      output: responseText,
+      answer: responseText,
+      messages,
       input: inputData.input,
       criteria: '',
       expected_output: [],
       input_files: inputData.input_files ?? [],
-      trace: null,
+      trace,
       token_usage: null,
       cost_usd: null,
       duration_ms: null,
diff --git a/apps/cli/src/commands/results/show.ts b/apps/cli/src/commands/results/show.ts
index 52b8b9cfa..8c1926d8c 100644
--- a/apps/cli/src/commands/results/show.ts
+++ b/apps/cli/src/commands/results/show.ts
@@ -35,9 +35,7 @@ function formatInput(result: EvaluationResult): string {
 
 function formatOutput(result: EvaluationResult): string {
   if (!result.output || result.output.length === 0) return '(no output)';
-  return result.output
-    .map((msg) => String((msg as unknown as Record<string, unknown>).content ?? ''))
-    .join('\n');
+  return result.output;
 }
 
 // ── Formatting ───────────────────────────────────────────────────────────
diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts
index 3ec89061b..e58c1b438 100644
--- a/packages/core/src/evaluation/graders/code-grader.ts
+++ b/packages/core/src/evaluation/graders/code-grader.ts
@@ -134,17 +134,19 @@ export class CodeGrader implements Grader {
       return imageTmpDir;
     };
 
-    // Materialize multimodal content (data URIs → temp files, source → path)
-    const materializedOutput = await materializeContentForGrader(
-      context.output as readonly Record<string, unknown>[] | undefined,
+    const transcriptMessages = context.trace?.messages ?? context.output ?? [];
+
+    // Materialize transcript multimodal content (data URIs → temp files, source → path)
+    const materializedMessages = await materializeContentForGrader(
+      transcriptMessages as unknown as readonly Record<string, unknown>[] | undefined,
       getImageDir,
     );
 
-    // Determine whether to use file-backed output for large payloads
-    let outputForPayload: readonly Record<string, unknown>[] | null = materializedOutput;
+    // Determine whether to use file-backed output for large final answers
+    let outputForPayload: string | null = context.candidate;
     let outputPath: string | undefined;
 
-    if (outputForPayload) {
+    if (outputForPayload !== null) {
       const serialized = JSON.stringify(outputForPayload);
       if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
         const tmpDir = await mkdtemp(join(tmpdir(), 'agentv-grader-'));
@@ -154,6 +156,13 @@ export class CodeGrader implements Grader {
       }
     }
 
+    const traceForPayload = context.trace
+      ? {
+          ...context.trace,
+          messages: materializedMessages ?? context.trace.messages,
+        }
+      : null;
+
     // Build payload (camelCase internally, converted to snake_case for graders)
     const payload = {
       criteria: context.evalCase.criteria,
@@ -162,6 +171,8 @@ export class CodeGrader implements Grader {
         getImageDir,
       ),
       output: outputForPayload,
+      answer: context.candidate,
+      messages: materializedMessages ?? [],
       outputPath,
       inputFiles: context.evalCase.file_paths,
       input: await materializeContentForGrader(
@@ -169,7 +180,16 @@ export class CodeGrader implements Grader {
         getImageDir,
       ),
       metadata: context.evalCase.metadata ?? null,
-      trace: context.trace ?? null,
+      trace: traceForPayload,
+      traceSummary: context.trace
+        ? {
+            eventCount: context.trace.eventCount,
+            toolCalls: context.trace.toolCalls,
+            errorCount: context.trace.errorCount,
+            toolDurations: context.trace.toolDurations,
+            llmCallCount: context.trace.llmCallCount,
+          }
+        : null,
       tokenUsage: context.tokenUsage ?? null,
       costUsd: context.costUsd ?? null,
       durationMs: context.durationMs ?? null,
diff --git a/packages/core/src/evaluation/graders/types.ts b/packages/core/src/evaluation/graders/types.ts
index 1d548e5f9..c1376e5e8 100644
--- a/packages/core/src/evaluation/graders/types.ts
+++ b/packages/core/src/evaluation/graders/types.ts
@@ -1,6 +1,6 @@
 import type { ResolvedTarget } from '../providers/targets.js';
 import type { ChatPrompt, Message, Provider } from '../providers/types.js';
-import type { TokenUsage, TraceSummary } from '../trace.js';
+import type { TokenUsage, Trace } from '../trace.js';
 import type {
   DependencyResult,
   DockerWorkspaceConfig,
@@ -37,8 +37,8 @@ export interface EvaluationContext {
   readonly evaluator?: GraderConfig;
   /** Output messages from agent execution (primary source for tool trajectory) */
   readonly output?: readonly Message[];
-  /** Lightweight summary of trace events (if available) */
-  readonly trace?: TraceSummary;
+  /** Canonical execution trace with messages, events, metrics, and provenance. */
+  readonly trace?: Trace;
   /** Token usage from provider execution (promoted from TraceSummary) */
   readonly tokenUsage?: TokenUsage;
   /** Total cost in USD (from provider) */
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index e3da97d6e..54e38fcd9 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -46,7 +46,10 @@ import { createBuiltinRegistry, discoverAssertions, discoverGraders } from './re
 import type { RunBudgetTracker } from './run-budget-tracker.js';
 import {
   type TokenUsage,
+  type Trace,
   type TraceSummary,
+  appendErrorEventToTrace,
+  buildTraceFromMessages,
   computeTraceSummary,
   mergeExecutionMetrics,
 } from './trace.js';
@@ -1130,10 +1133,9 @@ export async function runEvaluation(
 
     // Helper: build a DependencyResult from a completed EvaluationResult
     function toDependencyResult(r: EvaluationResult): DependencyResult {
-      const outputText = extractLastAssistantContent(r.output);
       return {
         score: r.score,
-        output: outputText,
+        output: r.output,
         workspace_path: r.workspacePath,
         details: r.scores
           ? (Object.fromEntries(
@@ -1196,6 +1198,7 @@ export async function runEvaluation(
       // eval files/targets in the current CLI invocation, so queued cases stop once
       // cumulative spend reaches the cap while already-running cases are allowed to finish.
       if (runBudgetTracker?.isExceeded()) {
+        const errorMessage = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`;
         const budgetResult: EvaluationResult = {
           timestamp: (now ?? (() => new Date()))().toISOString(),
           testId: evalCase.id,
@@ -1203,15 +1206,24 @@ export async function runEvaluation(
           category: evalCase.category,
           score: 0,
           assertions: [],
-          output: [],
+          output: errorMessage,
+          trace: buildTraceFromMessages({
+            input: evalCase.input as readonly Message[],
+            output: [{ role: 'assistant' as const, content: errorMessage }],
+            finalOutput: errorMessage,
+            target: target.name,
+            testId: evalCase.id,
+            conversationId: evalCase.conversation_id,
+            error: errorMessage,
+          }),
           target: target.name,
-          error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
+          error: errorMessage,
           budgetExceeded: true,
           executionStatus: 'execution_error',
           failureStage: 'setup',
           failureReasonCode: 'budget_exceeded',
           executionError: {
-            message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
+            message: errorMessage,
             stage: 'setup',
           },
         };
@@ -1235,6 +1247,7 @@ export async function runEvaluation(
 
       // Check suite-level budget before dispatching
       if (budgetUsd !== undefined && budgetExhausted) {
+        const errorMessage = `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`;
         const budgetResult: EvaluationResult = {
           timestamp: (now ?? (() => new Date()))().toISOString(),
           testId: evalCase.id,
@@ -1242,15 +1255,24 @@ export async function runEvaluation(
           category: evalCase.category,
           score: 0,
           assertions: [],
-          output: [],
+          output: errorMessage,
+          trace: buildTraceFromMessages({
+            input: evalCase.input as readonly Message[],
+            output: [{ role: 'assistant' as const, content: errorMessage }],
+            finalOutput: errorMessage,
+            target: target.name,
+            testId: evalCase.id,
+            conversationId: evalCase.conversation_id,
+            error: errorMessage,
+          }),
           target: target.name,
-          error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
+          error: errorMessage,
           budgetExceeded: true,
           executionStatus: 'execution_error',
           failureStage: 'setup',
           failureReasonCode: 'budget_exceeded',
           executionError: {
-            message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
+            message: errorMessage,
             stage: 'setup',
           },
         };
@@ -1282,7 +1304,16 @@ export async function runEvaluation(
           category: evalCase.category,
           score: 0,
           assertions: [],
-          output: [],
+          output: errorMsg,
+          trace: buildTraceFromMessages({
+            input: evalCase.input as readonly Message[],
+            output: [{ role: 'assistant' as const, content: errorMsg }],
+            finalOutput: errorMsg,
+            target: target.name,
+            testId: evalCase.id,
+            conversationId: evalCase.conversation_id,
+            error: errorMsg,
+          }),
           target: target.name,
           error: errorMsg,
           executionStatus: 'execution_error',
@@ -1455,7 +1486,16 @@ export async function runEvaluation(
                   category: evalCase.category,
                   score: 0,
                   assertions: [],
-                  output: [],
+                  output: errorMsg,
+                  trace: buildTraceFromMessages({
+                    input: evalCase.input as readonly Message[],
+                    output: [{ role: 'assistant' as const, content: errorMsg }],
+                    finalOutput: errorMsg,
+                    target: target.name,
+                    testId: evalCase.id,
+                    conversationId: evalCase.conversation_id,
+                    error: errorMsg,
+                  }),
                   target: target.name,
                   error: errorMsg,
                   executionStatus: 'execution_error',
@@ -1756,6 +1796,10 @@ async function runBatchEvaluation(options: {
       if (providerError) {
         result = {
           ...result,
+          trace: appendErrorEventToTrace(result.trace, providerError, {
+            failure_stage: 'agent',
+            failure_reason_code: 'provider_error',
+          }),
           error: providerError,
           executionStatus: 'execution_error' as const,
           failureStage: 'agent' as const,
@@ -2495,6 +2539,10 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
           ...result,
           ...targetUsedField,
           evalRun,
+          trace: appendErrorEventToTrace(result.trace, providerError, {
+            failure_stage: 'agent',
+            failure_reason_code: 'provider_error',
+          }),
           error: providerError,
           executionStatus,
           failureStage: 'agent' as const,
@@ -2510,6 +2558,10 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
             ...targetUsedField,
             score: 0,
             evalRun,
+            trace: appendErrorEventToTrace(result.trace, skippedEvaluatorError, {
+              failure_stage: 'evaluator',
+              failure_reason_code: 'evaluator_error',
+            }),
             error: skippedEvaluatorError,
             executionStatus,
             failureStage: 'evaluator' as const,
@@ -2748,6 +2800,24 @@ async function evaluateCandidate(options: {
     dependencyResults,
   } = options;
 
+  const input = buildResultInput(promptInputs);
+  const outputMessages = output ?? [{ role: 'assistant' as const, content: candidate }];
+  const evaluationTrace = buildTraceFromMessages({
+    input,
+    output: outputMessages,
+    summary: trace,
+    finalOutput: candidate,
+    tokenUsage,
+    costUsd,
+    durationMs,
+    startTime,
+    endTime,
+    provider: provider.kind,
+    target: target.name,
+    testId: evalCase.id,
+    conversationId: evalCase.conversation_id,
+  });
+
   const gradeTimestamp = nowFn();
   const { score, scores } = await runEvaluatorsForCase({
     evalCase,
@@ -2762,7 +2832,7 @@ async function evaluateCandidate(options: {
     graderProvider,
     agentTimeoutMs,
     output,
-    trace,
+    trace: evaluationTrace,
     costUsd,
     durationMs,
     tokenUsage,
@@ -2811,8 +2881,6 @@ async function evaluateCandidate(options: {
           ...(evaluatorRequest ? { evaluator: evaluatorRequest } : {}),
         }
       : undefined;
-  const input = buildResultInput(promptInputs);
-
   return {
     timestamp: completedAt.toISOString(),
     testId: evalCase.id,
@@ -2829,9 +2897,9 @@ async function evaluateCandidate(options: {
     endTime,
     requests,
     input,
-    output: output ?? [{ role: 'assistant' as const, content: candidate }],
+    output: candidate,
     scores: scores,
-    trace: trace,
+    trace: evaluationTrace,
     fileChanges,
     executionStatus: classifyQualityStatus(score.score, evalThreshold),
   };
@@ -2850,7 +2918,7 @@ async function runEvaluatorsForCase(options: {
   readonly graderProvider?: Provider;
   readonly agentTimeoutMs?: number;
   readonly output?: readonly Message[];
-  readonly trace?: TraceSummary;
+  readonly trace?: Trace;
   readonly costUsd?: number;
   readonly durationMs?: number;
   readonly tokenUsage?: TokenUsage;
@@ -2993,7 +3061,7 @@ async function runEvaluatorList(options: {
   readonly graderProvider?: Provider;
   readonly agentTimeoutMs?: number;
   readonly output?: readonly Message[];
-  readonly trace?: TraceSummary;
+  readonly trace?: Trace;
   readonly costUsd?: number;
   readonly durationMs?: number;
   readonly tokenUsage?: TokenUsage;
@@ -3519,9 +3587,20 @@ async function runConversationMode(options: {
     role: m.role,
     content: m.content,
   }));
+  const totalDurationMs = Date.now() - caseStartMs;
+  const finalOutput = extractLastAssistantContent(outputMessages);
+  const trace = buildTraceFromMessages({
+    input: evalCase.input as readonly Message[],
+    output: outputMessages,
+    finalOutput,
+    durationMs: totalDurationMs,
+    provider: provider.kind,
+    target: target.name,
+    testId: evalCase.id,
+    conversationId: evalCase.conversation_id,
+  });
 
   const flatAssertions: AssertionEntry[] = allResultScores.flatMap((s) => [...s.assertions]);
-  const totalDurationMs = Date.now() - caseStartMs;
 
   return {
     timestamp: nowFn().toISOString(),
@@ -3531,7 +3610,8 @@ async function runConversationMode(options: {
     score: finalScore,
     assertions: flatAssertions,
     target: target.name,
-    output: outputMessages,
+    output: finalOutput,
+    trace,
     scores: allResultScores,
     executionStatus: classifyQualityStatus(finalScore, threshold ?? DEFAULT_THRESHOLD),
     input: evalCase.input.map((m) => ({
@@ -3732,6 +3812,16 @@ function buildErrorResult(
         }
       : undefined;
   const input = buildResultInput(promptInputs);
+  const output = `Error occurred: ${message}`;
+  const trace = buildTraceFromMessages({
+    input,
+    output: [{ role: 'assistant' as const, content: output }],
+    finalOutput: output,
+    target: targetName,
+    testId: evalCase.id,
+    conversationId: evalCase.conversation_id,
+    error: message,
+  });
 
   return {
     timestamp: timestamp.toISOString(),
@@ -3744,7 +3834,8 @@ function buildErrorResult(
     target: targetName,
     requests,
     input,
-    output: [{ role: 'assistant' as const, content: `Error occurred: ${message}` }],
+    output,
+    trace,
     error: message,
     executionStatus: 'execution_error',
     failureStage,
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 7e764ff8b..cb3735296 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -1,4 +1,4 @@
-import type { TokenUsage, ToolTrajectoryGraderConfig, TraceSummary } from './trace.js';
+import type { TokenUsage, ToolTrajectoryGraderConfig, Trace } from './trace.js';
 
 /** A single assertion verdict with optional evidence. */
 export interface AssertionEntry {
@@ -1194,14 +1194,14 @@ export interface EvaluationResult {
   };
   readonly scores?: readonly GraderResult[];
   readonly error?: string;
-  /** Lightweight summary of the execution trace (always included when available) */
-  readonly trace?: TraceSummary;
+  /** Canonical execution trace: messages, events, metrics, and provider provenance. */
+  readonly trace: Trace;
   /** Path to the temporary workspace directory (included on failure for debugging) */
   readonly workspacePath?: string;
   /** Input messages sent to the agent. Always Message[] for consistent shape with output. */
   readonly input?: readonly import('./providers/types.js').Message[];
-  /** Output messages from agent execution. Always present — at minimum contains the final assistant message. */
-  readonly output: readonly import('./providers/types.js').Message[];
+  /** Final answer / scored result only. Full transcript lives in trace.messages/events. */
+  readonly output: string;
   /** Captured output from workspace before_all script */
   readonly beforeAllOutput?: string;
   /** Captured output from workspace before_each script */
diff --git a/packages/core/src/observability/otel-exporter.ts b/packages/core/src/observability/otel-exporter.ts
index 73f1a98b1..d3c400a9a 100644
--- a/packages/core/src/observability/otel-exporter.ts
+++ b/packages/core/src/observability/otel-exporter.ts
@@ -185,10 +185,7 @@ export class OtelTraceExporter {
         if (result.suite) rootSpan.setAttribute('agentv.suite', result.suite);
         rootSpan.setAttribute('agentv.score', result.score);
         if (captureContent && result.output.length > 0) {
-          const lastMsg = result.output[result.output.length - 1];
-          const text =
-            typeof lastMsg.content === 'string' ? lastMsg.content : JSON.stringify(lastMsg.content);
-          rootSpan.setAttribute('agentv.output_text', text);
+          rootSpan.setAttribute('agentv.output_text', result.output);
         }
 
         // Flat execution metrics
@@ -219,12 +216,13 @@ export class OtelTraceExporter {
             rootSpan.setAttribute('agentv.trace.llm_call_count', t.llmCallCount);
         }
 
-        // Child spans from output messages (--trace mode)
-        if (result.output) {
+        // Child spans from trace messages (--trace mode)
+        const traceMessages = result.trace.messages;
+        if (traceMessages.length > 0) {
           const parentCtx = api.trace.setSpan(api.context.active(), rootSpan);
 
           if (this.options.groupTurns) {
-            const turns = groupMessagesIntoTurns(result.output);
+            const turns = groupMessagesIntoTurns(traceMessages);
             if (turns.length > 1) {
               for (const [i, turn] of turns.entries()) {
                 api.context.with(parentCtx, () => {
@@ -244,12 +242,12 @@ export class OtelTraceExporter {
                 });
               }
             } else {
-              for (const msg of result.output) {
+              for (const msg of traceMessages) {
                 this.exportMessage(tracer, api, parentCtx, msg, captureContent);
               }
             }
           } else {
-            for (const msg of result.output) {
+            for (const msg of traceMessages) {
               this.exportMessage(tracer, api, parentCtx, msg, captureContent);
             }
           }
@@ -593,13 +591,13 @@ export class OtelStreamingObserver {
     }
 
     const model =
-      result.output.find((msg) => msg.role === 'assistant')?.metadata?.model ??
+      result.trace.messages.find((msg) => msg.role === 'assistant')?.metadata?.model ??
       result.target ??
       'unknown';
 
     this.onLlmCall(String(model), result.tokenUsage);
 
-    for (const message of result.output) {
+    for (const message of result.trace.messages) {
       for (const toolCall of message.toolCalls ?? []) {
         this.onToolCall(
           toolCall.tool,

From 082298261d571313433731492ee5d8411e11a5a2 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 12 Jun 2026 07:25:25 +0200
Subject: [PATCH 3/7] feat(cli): write answer and transcript artifacts

---
 apps/cli/src/commands/eval/artifact-writer.ts | 156 +++++++++++-------
 apps/cli/src/commands/results/manifest.ts     |  43 +++--
 packages/core/src/import/types.ts             |   2 +-
 3 files changed, 134 insertions(+), 67 deletions(-)

diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index 8760a3728..8fcb8ed71 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -6,8 +6,12 @@ import {
   type EvalTest,
   type EvaluationResult,
   type GraderResult,
+  type Message,
   type TargetDefinition,
-  toTranscriptJsonLines,
+  type TraceSummary,
+  buildTraceFromMessages,
+  extractLastAssistantContent,
+  traceToTranscriptJsonLines,
 } from '@agentv/core';
 import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
 import { RESULT_INDEX_FILENAME } from './result-layout.js';
@@ -195,7 +199,10 @@ export interface IndexArtifactEntry {
   readonly grading_path: string;
   readonly timing_path: string;
   readonly output_path?: string;
+  readonly answer_path?: string;
+  readonly transcript_path?: string;
   readonly input_path?: string;
+  /** @deprecated Use output_path/answer_path for the final answer. */
   readonly response_path?: string;
   readonly task_dir?: string;
   readonly eval_path?: string;
@@ -245,23 +252,8 @@ function countToolCalls(result: EvaluationResult): {
   toolCalls: Record<string, number>;
   total: number;
 } {
-  const toolCalls: Record<string, number> = {};
-  let total = 0;
-
-  const trace = result.trace as
-    | { steps?: readonly { toolName?: string; type?: string }[] }
-    | undefined;
-
-  if (trace?.steps) {
-    for (const step of trace.steps) {
-      if (step.toolName || step.type === 'tool') {
-        const name = step.toolName ?? 'unknown';
-        toolCalls[name] = (toolCalls[name] ?? 0) + 1;
-        total += 1;
-      }
-    }
-  }
-
+  const toolCalls = { ...(result.trace?.toolCalls ?? {}) };
+  const total = Object.values(toolCalls).reduce((sum, count) => sum + count, 0);
   return { toolCalls, total };
 }
 
@@ -365,9 +357,8 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact
     workspace_changes: parseWorkspaceChanges(result.fileChanges),
     conversation: result.conversationId
       ? {
-          turns: result.trace
-            ? ((result.trace as { steps?: readonly unknown[] }).steps?.length ?? 0)
-            : 0,
+          turns:
+            result.trace?.messages.filter((message) => message.role === 'assistant').length ?? 0,
           conversation_id: result.conversationId,
         }
       : undefined,
@@ -661,7 +652,10 @@ export function buildIndexArtifactEntry(
     gradingPath: string;
     timingPath: string;
     outputPath?: string;
+    answerPath?: string;
+    transcriptPath?: string;
     inputPath?: string;
+    responsePath?: string;
     taskBundle?: MaterializedTaskBundlePaths;
   },
 ): IndexArtifactEntry {
@@ -689,9 +683,18 @@ export function buildIndexArtifactEntry(
     output_path: options.outputPath
       ? toRelativeArtifactPath(options.outputDir, options.outputPath)
       : undefined,
+    answer_path: options.answerPath
+      ? toRelativeArtifactPath(options.outputDir, options.answerPath)
+      : undefined,
+    transcript_path: options.transcriptPath
+      ? toRelativeArtifactPath(options.outputDir, options.transcriptPath)
+      : undefined,
     input_path: options.inputPath
       ? toRelativeArtifactPath(options.outputDir, options.inputPath)
       : undefined,
+    response_path: options.responsePath
+      ? toRelativeArtifactPath(options.outputDir, options.responsePath)
+      : undefined,
     ...buildTaskBundleIndexFields(options.outputDir, options.taskBundle),
     metadata: result.metadata,
   };
@@ -703,7 +706,8 @@ export function buildResultIndexArtifact(
 ): ResultIndexArtifact {
   const artifactSubdir = buildArtifactSubdir(result);
   const input = extractInput(result);
-  const hasResponse = Array.isArray(result.output) && result.output.length > 0;
+  const hasAnswer = result.output.length > 0;
+  const hasTranscript = result.trace.messages.length > 0 || result.trace.events.length > 0;
 
   return {
     timestamp: result.timestamp,
@@ -725,10 +729,12 @@ export function buildResultIndexArtifact(
     grading_path: path.posix.join(artifactSubdir, 'grading.json'),
     timing_path: path.posix.join(artifactSubdir, 'timing.json'),
     input_path: input ? path.posix.join(artifactSubdir, 'input.md') : undefined,
-    output_path: hasResponse
-      ? path.posix.join(artifactSubdir, 'outputs', 'response.md')
+    output_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined,
+    answer_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined,
+    transcript_path: hasTranscript
+      ? path.posix.join(artifactSubdir, 'outputs', 'transcript.jsonl')
       : undefined,
-    response_path: hasResponse
+    response_path: hasAnswer
       ? path.posix.join(artifactSubdir, 'outputs', 'response.md')
       : undefined,
     ...(taskBundle
@@ -756,6 +762,16 @@ async function writeJsonlFile(filePath: string, records: readonly unknown[]): Pr
   await writeFile(filePath, content, 'utf8');
 }
 
+/**
+ * Writes the trace as JSONL: one record per line, newline-terminated; '' if none.
+ */
+async function writeTranscriptJsonl(filePath: string, result: EvaluationResult): Promise<void> {
+  const { testId, target, trace } = result;
+  const records = traceToTranscriptJsonLines(trace, { testId, target });
+  const body = records.map((record) => JSON.stringify(record)).join('\n');
+  await writeFile(filePath, records.length > 0 ? `${body}\n` : '', 'utf8');
+}
+
 function isRecord(value: unknown): value is Record<string, unknown> {
   return typeof value === 'object' && value !== null && !Array.isArray(value);
 }
@@ -852,6 +868,7 @@ type ParsedEvaluationResult = Record<string, unknown> & {
   assertions: EvaluationResult['assertions'];
   target: string;
   output: EvaluationResult['output'];
+  trace: EvaluationResult['trace'];
   executionStatus: EvaluationResult['executionStatus'];
 };
 
@@ -874,7 +891,7 @@ function isAssertionEntry(value: unknown): value is EvaluationResult['assertions
   );
 }
 
-function isOutputMessage(value: unknown): value is EvaluationResult['output'][number] {
+function isOutputMessage(value: unknown): value is Message {
   if (!value || typeof value !== 'object' || Array.isArray(value)) {
     return false;
   }
@@ -890,12 +907,47 @@ function isExecutionStatus(value: unknown): value is EvaluationResult['execution
   );
 }
 
+/**
+ * Structural check for the canonical trace shape: a non-null, non-array
+ * object whose `messages` and `events` properties are both arrays. Array
+ * contents are not inspected.
+ */
+function isTraceRecord(value: unknown): value is EvaluationResult['trace'] {
+  if (!isRecord(value)) return false;
+  return Array.isArray(value.messages) && Array.isArray(value.events);
+}
+
 function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefined {
   if (!value || typeof value !== 'object' || Array.isArray(value)) {
     return undefined;
   }
 
   const result = value as Record<string, unknown>;
+  const legacyOutputMessages = Array.isArray(result.output)
+    ? result.output.filter(isOutputMessage)
+    : undefined;
+  const output =
+    typeof result.output === 'string'
+      ? result.output
+      : extractLastAssistantContent(legacyOutputMessages);
+  const legacySummary =
+    result.trace && typeof result.trace === 'object' && !Array.isArray(result.trace)
+      ? (result.trace as TraceSummary)
+      : undefined;
+  const trace = isTraceRecord(result.trace)
+    ? result.trace
+    : buildTraceFromMessages({
+        input: Array.isArray(result.input) ? (result.input as EvaluationResult['input']) : [],
+        output: legacyOutputMessages,
+        summary: legacySummary,
+        finalOutput: output,
+        tokenUsage: result.tokenUsage as EvaluationResult['tokenUsage'],
+        costUsd: typeof result.costUsd === 'number' ? result.costUsd : undefined,
+        durationMs: typeof result.durationMs === 'number' ? result.durationMs : undefined,
+        target: typeof result.target === 'string' ? result.target : undefined,
+        testId: typeof result.testId === 'string' ? result.testId : undefined,
+      });
+
   return {
     ...result,
     timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(),
@@ -903,7 +955,8 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin
     score: typeof result.score === 'number' ? result.score : 0,
     assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [],
     target: typeof result.target === 'string' ? result.target : 'unknown',
-    output: Array.isArray(result.output) ? result.output.filter(isOutputMessage) : [],
+    output,
+    trace,
     executionStatus: isExecutionStatus(result.executionStatus) ? result.executionStatus : 'ok',
   };
 }
@@ -959,23 +1012,10 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri
   const lines: string[] = [];
 
   for (const result of results) {
-    const transcriptLines = toTranscriptJsonLines(
-      {
-        messages: [...(result.input ?? []), ...result.output],
-        source: {
-          provider: result.target,
-          sessionId: result.conversationId ?? result.testId,
-          startedAt: result.timestamp,
-        },
-        tokenUsage: result.tokenUsage,
-        durationMs: result.durationMs,
-        costUsd: result.costUsd,
-      },
-      {
-        testId: result.testId,
-        target: result.target,
-      },
-    );
+    const transcriptLines = traceToTranscriptJsonLines(result.trace, {
+      testId: result.testId,
+      target: result.target,
+    });
 
     lines.push(...transcriptLines.map((line) => JSON.stringify(line)));
   }
@@ -1085,14 +1125,16 @@ export async function writePerTestArtifacts(
     if (input) {
       await writeFile(path.join(testDir, 'input.md'), input, 'utf8');
     }
-    if (result.output && result.output.length > 0) {
+    // Any of answer/messages/events triggers outputs; the index lists events-only transcripts.
+    if (result.output.length + result.trace.messages.length + result.trace.events.length > 0) {
       const outputsDir = path.join(testDir, 'outputs');
       await mkdir(outputsDir, { recursive: true });
-      await writeFile(
-        path.join(outputsDir, 'response.md'),
-        formatOutputMarkdown(result.output),
-        'utf8',
-      );
+      if (result.output.length > 0) {
+        await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8');
+        // Deprecated alias of answer.md; use transcript.jsonl for the full execution record.
+        await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8');
+      }
+      await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result);
     }
 
     const taskBundle = await materializeTaskBundleForResult({
@@ -1156,14 +1198,16 @@ export async function writeArtifactsFromResults(
       await writeFile(path.join(testDir, 'input.md'), input, 'utf8');
     }
 
-    if (result.output && result.output.length > 0) {
+    // Any of answer/messages/events triggers outputs; the index lists events-only transcripts.
+    if (result.output.length + result.trace.messages.length + result.trace.events.length > 0) {
       const outputsDir = path.join(testDir, 'outputs');
       await mkdir(outputsDir, { recursive: true });
-      await writeFile(
-        path.join(outputsDir, 'response.md'),
-        formatOutputMarkdown(result.output),
-        'utf8',
-      );
+      if (result.output.length > 0) {
+        await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8');
+        // Deprecated alias of answer.md; use transcript.jsonl for the full execution record.
+        await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8');
+      }
+      await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result);
     }
 
     const taskBundle = await materializeTaskBundleForResult({
diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
index 1ec215610..99dd71993 100644
--- a/apps/cli/src/commands/results/manifest.ts
+++ b/apps/cli/src/commands/results/manifest.ts
@@ -1,7 +1,12 @@
 import { existsSync, readFileSync } from 'node:fs';
 import path from 'node:path';
 
-import type { EvaluationResult } from '@agentv/core';
+import {
+  type EvaluationResult,
+  type TranscriptJsonLine,
+  buildTraceFromMessages,
+  traceFromTranscriptJsonLines,
+} from '@agentv/core';
 
 import type { GradingArtifact, TimingArtifact } from '../eval/artifact-writer.js';
 import {
@@ -32,6 +37,8 @@ export interface ResultManifestRecord {
   readonly timing_path?: string;
   readonly input_path?: string;
   readonly output_path?: string;
+  readonly answer_path?: string;
+  readonly transcript_path?: string;
   readonly response_path?: string;
   readonly artifact_dir?: string;
   readonly task_dir?: string;
@@ -106,20 +113,35 @@ function hydrateOutput(
   baseDir: string,
   record: ResultManifestRecord,
 ): EvaluationResult['output'] | undefined {
-  const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
+  const responseText = readOptionalText(
+    baseDir,
+    record.output_path ?? record.answer_path ?? record.response_path,
+  );
   if (!responseText) {
     return undefined;
   }
 
-  const messages = parseMarkdownMessages(responseText);
-  if (messages.length > 0) {
-    return messages.map((message) => ({
-      role: message.role as 'assistant' | 'user' | 'system' | 'tool',
-      content: message.content,
-    }));
+  return responseText.trimEnd();
+}
+
+function hydrateTrace(baseDir: string, record: ResultManifestRecord): EvaluationResult['trace'] {
+  const transcriptText = readOptionalText(baseDir, record.transcript_path);
+  if (transcriptText) {
+    try {
+      return traceFromTranscriptJsonLines(parseJsonlLines<TranscriptJsonLine>(transcriptText));
+    } catch {
+      // Fall through to a minimal trace below.
+    }
   }
 
-  return [{ role: 'assistant', content: responseText.trimEnd() }];
+  const output = hydrateOutput(baseDir, record) ?? '';
+  return buildTraceFromMessages({
+    input: hydrateInput(baseDir, record),
+    output: output ? [{ role: 'assistant', content: output }] : [],
+    finalOutput: output,
+    target: record.target,
+    testId: record.test_id,
+  });
 }
 
 function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): EvaluationResult {
@@ -176,7 +198,8 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
     durationMs: timing?.duration_ms ?? record.duration_ms,
     costUsd: record.cost_usd,
     input: hydrateInput(baseDir, record),
-    output: hydrateOutput(baseDir, record),
+    output: hydrateOutput(baseDir, record) ?? '',
+    trace: hydrateTrace(baseDir, record),
     metadata: record.metadata,
   } as EvaluationResult;
 }
diff --git a/packages/core/src/import/types.ts b/packages/core/src/import/types.ts
index 2a451c24f..baeacb603 100644
--- a/packages/core/src/import/types.ts
+++ b/packages/core/src/import/types.ts
@@ -18,7 +18,7 @@ import { readFile } from 'node:fs/promises';
 
 import { toCamelCaseDeep, toSnakeCaseDeep } from '../evaluation/case-conversion.js';
 import type { Message, ProviderTokenUsage } from '../evaluation/providers/types.js';
-import { buildTraceFromMessages, type Trace } from '../evaluation/trace.js';
+import { type Trace, buildTraceFromMessages } from '../evaluation/trace.js';
 
 /**
  * A parsed transcript: ordered messages plus session metadata (internal camelCase).

From 1fdb9a2378069a44f951c7dcdb2d04fefc0d4d58 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 12 Jun 2026 07:30:42 +0200
Subject: [PATCH 4/7] chore: remove repo-local ntm artifacts

---
 .ntm/palette.md    | 17 -----------------
 .ntm/personas.toml |  8 --------
 2 files changed, 25 deletions(-)
 delete mode 100644 .ntm/palette.md
 delete mode 100644 .ntm/personas.toml

diff --git a/.ntm/palette.md b/.ntm/palette.md
deleted file mode 100644
index 8b821ce3e..000000000
--- a/.ntm/palette.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Project Commands
-
-## Project
-### build | Build Project
-bun run build
-
-### test | Run Tests
-bun run test
-
-### typecheck | Typecheck Workspaces
-bun run typecheck
-
-### lint | Lint and Format Check
-bun run lint
-
-### validate-evals | Validate Example Eval YAML
-bun run validate:examples
diff --git a/.ntm/personas.toml b/.ntm/personas.toml
deleted file mode 100644
index 47fa361ca..000000000
--- a/.ntm/personas.toml
+++ /dev/null
@@ -1,8 +0,0 @@
-# Project personas for NTM
-# Define specialized agent roles and behaviors here.
-# Example:
-# [[personas]]
-# name = "architect"
-# agent = "claude"
-# description = "High-level design and architecture"
-# system_prompt = """You are the architecture specialist..."""

From e68544947c1e880249fa528cb1675b3e778bf664 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 12 Jun 2026 07:33:59 +0200
Subject: [PATCH 5/7] chore(targets): remove duplicate pi sdk openai target

---
 .agentv/targets.yaml | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
index 7586f1a2e..eef024a74 100644
--- a/.agentv/targets.yaml
+++ b/.agentv/targets.yaml
@@ -93,15 +93,5 @@ targets:
     thinking: low
     stream_log: raw
 
-  - name: pi-sdk-openai
-    provider: pi-coding-agent
-    subprovider: openai
-    base_url: ${{ OPENAI_ENDPOINT }}
-    api_key: ${{ OPENAI_API_KEY }}
-    model: gpt-5.5
-    grader_target: openai
-    thinking: low
-    stream_log: raw
-
   - name: pi-azure
     provider: pi-cli

From 3d9064c4e768781ae72b8ec0983b9460318a0183 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 12 Jun 2026 09:49:47 +0200
Subject: [PATCH 6/7] fix(evaluation): stabilize final output trace contract

---
 apps/cli/test/commands/eval/aggregate.test.ts |  22 +++-
 .../commands/eval/artifact-writer.test.ts     |  69 ++++++++----
 .../commands/eval/output-messages.test.ts     | 103 ++----------------
 .../results/export-e2e-providers.test.ts      |  32 +++---
 apps/cli/test/commands/results/export.test.ts |  21 ++--
 apps/cli/test/commands/results/report.test.ts |  21 +++-
 apps/cli/test/commands/results/show.test.ts   |   2 +-
 apps/cli/test/commands/trace/trace.test.ts    |  14 ++-
 apps/cli/test/fixtures/mock-run-evaluation.ts |  45 +++++++-
 .../core/src/observability/otel-exporter.ts   |   6 +-
 .../evaluation/code-grader-multimodal.test.ts |  10 +-
 .../test/evaluation/conversation-mode.test.ts |  12 +-
 .../core/test/evaluation/orchestrator.test.ts |  15 +--
 .../core/test/fixtures/test-define-grader.ts  |   5 +-
 .../fixtures/test-grader-with-details.cjs     |  13 ++-
 packages/core/test/fixtures/test-grader.cjs   |  17 ++-
 .../test/observability/otel-exporter.test.ts  |  73 +++++++++----
 packages/eval/test/define-code-grader.test.ts |  31 ++++--
 .../eval/test/define-prompt-template.test.ts  |  32 +++---
 packages/eval/test/deprecation.test.ts        |   8 +-
 packages/eval/test/file-backed-output.test.ts |  15 +--
 21 files changed, 321 insertions(+), 245 deletions(-)

diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts
index c79bb7076..91200aa61 100644
--- a/apps/cli/test/commands/eval/aggregate.test.ts
+++ b/apps/cli/test/commands/eval/aggregate.test.ts
@@ -3,7 +3,7 @@ import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'nod
 import { tmpdir } from 'node:os';
 import path from 'node:path';
 
-import type { EvaluationResult } from '@agentv/core';
+import { type EvaluationResult, buildTraceFromMessages } from '@agentv/core';
 import { toSnakeCaseDeep } from '../../../src/utils/case-conversion.js';
 
 import {
@@ -14,16 +14,28 @@ import {
 } from '../../../src/commands/eval/artifact-writer.js';
 
 function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
-  return {
+  const result = {
     timestamp: '2026-04-13T00:00:00.000Z',
     testId: 'test-1',
     score: 0.9,
     assertions: [{ text: 'criterion-1', passed: true }],
-    output: [{ role: 'assistant' as const, content: 'test answer' }],
+    output: 'test answer',
     target: 'test-target',
     executionStatus: 'ok',
     ...overrides,
   } as EvaluationResult;
+
+  return {
+    ...result,
+    trace:
+      result.trace ??
+      buildTraceFromMessages({
+        output: result.output ? [{ role: 'assistant', content: result.output }] : [],
+        finalOutput: result.output,
+        target: result.target,
+        testId: result.testId,
+      }),
+  };
 }
 
 function writeJsonlIndex(dir: string, results: Partial<EvaluationResult>[]): string {
@@ -180,9 +192,7 @@ describe('writePerTestArtifacts', () => {
   });
 
   it('writes response.md for results with output', async () => {
-    const results = [
-      makeResult({ testId: 'test-1', output: [{ role: 'assistant' as const, content: 'hello' }] }),
-    ];
+    const results = [makeResult({ testId: 'test-1', output: 'hello' })];
 
     await writePerTestArtifacts(results, tmpDir);
 
diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
index 56106fa1a..3bfca87bf 100644
--- a/apps/cli/test/commands/eval/artifact-writer.test.ts
+++ b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -6,6 +6,7 @@ import {
   type EvalTest,
   type EvaluationResult,
   type GraderResult,
+  buildTraceFromMessages,
   parseYamlValue,
 } from '@agentv/core';
 
@@ -26,16 +27,33 @@ import {
 } from '../../../src/commands/eval/artifact-writer.js';
 
 function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
-  return {
+  const result = {
     timestamp: '2026-03-13T00:00:00.000Z',
     testId: 'test-1',
     score: 0.9,
     assertions: [{ text: 'criterion-1', passed: true }],
-    output: [{ role: 'assistant' as const, content: 'test answer' }],
+    output: 'test answer',
     target: 'test-target',
     executionStatus: 'ok',
     ...overrides,
   } as EvaluationResult;
+
+  return {
+    ...result,
+    trace:
+      result.trace ??
+      buildTraceFromMessages({
+        input: Array.isArray(result.input) ? result.input : [],
+        output: result.output ? [{ role: 'assistant', content: result.output }] : [],
+        finalOutput: result.output,
+        target: result.target,
+        testId: result.testId,
+        conversationId: result.conversationId,
+        tokenUsage: result.tokenUsage,
+        durationMs: result.durationMs,
+        costUsd: result.costUsd,
+      }),
+  };
 }
 
 function makeEvaluatorResult(overrides: Partial<GraderResult> = {}): GraderResult {
@@ -734,6 +752,20 @@ describe('writeArtifactsFromResults', () => {
   });
 
   it('writes transcript.jsonl as one message object per line', async () => {
+    const input = [{ role: 'user' as const, content: 'Inspect artifact output' }];
+    const output = [
+      {
+        role: 'assistant' as const,
+        content: 'Reading artifact-writer.ts',
+        toolCalls: [
+          {
+            tool: 'Read',
+            input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' },
+            output: 'file contents',
+          },
+        ],
+      },
+    ];
     const results = [
       makeResult({
         testId: 'transcript-case',
@@ -742,20 +774,19 @@ describe('writeArtifactsFromResults', () => {
         durationMs: 4200,
         costUsd: 0.25,
         tokenUsage: { input: 100, output: 40, cached: 10, reasoning: 5 },
-        input: [{ role: 'user' as const, content: 'Inspect artifact output' }],
-        output: [
-          {
-            role: 'assistant' as const,
-            content: 'Reading artifact-writer.ts',
-            toolCalls: [
-              {
-                tool: 'Read',
-                input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' },
-                output: 'file contents',
-              },
-            ],
-          },
-        ],
+        input,
+        output: 'Reading artifact-writer.ts',
+        trace: buildTraceFromMessages({
+          input,
+          output,
+          finalOutput: 'Reading artifact-writer.ts',
+          target: 'codex',
+          testId: 'transcript-case',
+          conversationId: 'session-123',
+          tokenUsage: { input: 100, output: 40, cached: 10, reasoning: 5 },
+          durationMs: 4200,
+          costUsd: 0.25,
+        }),
       }),
     ];
 
@@ -779,7 +810,6 @@ describe('writeArtifactsFromResults', () => {
         source: {
           provider: 'codex',
           session_id: 'session-123',
-          timestamp: '2026-03-13T00:00:00.000Z',
         },
       },
       {
@@ -801,7 +831,6 @@ describe('writeArtifactsFromResults', () => {
         source: {
           provider: 'codex',
           session_id: 'session-123',
-          timestamp: '2026-03-13T00:00:00.000Z',
         },
       },
     ]);
@@ -822,7 +851,7 @@ describe('writeArtifactsFromResults', () => {
         target: 'baseline',
         assertions: [{ text: 'baseline-check', passed: true, evidence: 'baseline evidence' }],
         input: [{ role: 'user' as const, content: 'baseline input' }],
-        output: [{ role: 'assistant' as const, content: 'baseline output' }],
+        output: 'baseline output',
       }),
     ];
 
@@ -1136,7 +1165,7 @@ describe('writeArtifacts (from JSONL file)', () => {
         test_id: 'from-file',
         score: 0.85,
         assertions: [{ text: 'pass-1', passed: true }],
-        output: [{ role: 'assistant', content: 'file answer' }],
+        output: 'file answer',
         target: 'default',
         execution_status: 'ok',
         duration_ms: 12000,
diff --git a/apps/cli/test/commands/eval/output-messages.test.ts b/apps/cli/test/commands/eval/output-messages.test.ts
index 5f345615f..a49dd0d6c 100644
--- a/apps/cli/test/commands/eval/output-messages.test.ts
+++ b/apps/cli/test/commands/eval/output-messages.test.ts
@@ -1,105 +1,16 @@
 import { describe, expect, it } from 'bun:test';
 
-import type { Message } from '@agentv/core';
-
 import { trimOutputMessages } from '../../../src/commands/eval/run-eval.js';
 
-const makeMessages = (): readonly Message[] => [
-  { role: 'user', content: 'Hello', startTime: '2024-01-01T00:00:00Z', durationMs: 10 },
-  {
-    role: 'assistant',
-    content: 'Hi there',
-    toolCalls: [{ id: 'tc1', name: 'read', arguments: '{}' }],
-    startTime: '2024-01-01T00:00:01Z',
-  },
-  { role: 'tool', content: 'file contents', name: 'read', durationMs: 50 },
-  { role: 'assistant', content: 'Done!', startTime: '2024-01-01T00:00:02Z', durationMs: 100 },
-];
-
 describe('trimOutputMessages', () => {
-  describe('default (outputMessages = 1)', () => {
-    it('returns only the last assistant message trimmed to { role, content }', () => {
-      const result = trimOutputMessages(makeMessages(), 1);
-      expect(result).toEqual([{ role: 'assistant', content: 'Done!' }]);
-    });
-
-    it('returns empty array when no assistant message exists', () => {
-      const messages: readonly Message[] = [{ role: 'user', content: 'Hello' }];
-      const result = trimOutputMessages(messages, 1);
-      expect(result).toEqual([]);
-    });
-
-    it('strips toolCalls, startTime, durationMs from the last assistant message', () => {
-      const messages: readonly Message[] = [
-        {
-          role: 'assistant',
-          content: 'response',
-          toolCalls: [{ id: 'tc1', name: 'read', arguments: '{}' }],
-          startTime: '2024-01-01T00:00:00Z',
-          durationMs: 500,
-        },
-      ];
-      const result = trimOutputMessages(messages, 1);
-      expect(result).toEqual([{ role: 'assistant', content: 'response' }]);
-      expect(result[0]).not.toHaveProperty('toolCalls');
-      expect(result[0]).not.toHaveProperty('startTime');
-      expect(result[0]).not.toHaveProperty('durationMs');
-    });
-  });
-
-  describe('outputMessages = N (numeric)', () => {
-    it('returns last N messages (any role) trimmed to { role, content }', () => {
-      const result = trimOutputMessages(makeMessages(), 3);
-      expect(result).toEqual([
-        { role: 'assistant', content: 'Hi there' },
-        { role: 'tool', content: 'file contents' },
-        { role: 'assistant', content: 'Done!' },
-      ]);
-    });
-
-    it('returns all messages when N exceeds message count', () => {
-      const result = trimOutputMessages(makeMessages(), 100);
-      expect(result).toHaveLength(4);
-      expect(result[0]).toEqual({ role: 'user', content: 'Hello' });
-    });
-
-    it('strips metadata from all returned messages', () => {
-      const result = trimOutputMessages(makeMessages(), 2);
-      for (const msg of result) {
-        expect(Object.keys(msg).sort()).toEqual(['content', 'role']);
-      }
-    });
-  });
-
-  describe('outputMessages = "all"', () => {
-    it('returns all messages trimmed to { role, content }', () => {
-      const result = trimOutputMessages(makeMessages(), 'all');
-      expect(result).toHaveLength(4);
-      expect(result).toEqual([
-        { role: 'user', content: 'Hello' },
-        { role: 'assistant', content: 'Hi there' },
-        { role: 'tool', content: 'file contents' },
-        { role: 'assistant', content: 'Done!' },
-      ]);
-    });
-
-    it('strips all metadata fields from every message', () => {
-      const result = trimOutputMessages(makeMessages(), 'all');
-      for (const msg of result) {
-        expect(msg).not.toHaveProperty('toolCalls');
-        expect(msg).not.toHaveProperty('startTime');
-        expect(msg).not.toHaveProperty('durationMs');
-        expect(msg).not.toHaveProperty('name');
-      }
-    });
+  it('leaves final-answer output unchanged', () => {
+    expect(trimOutputMessages('Done!', 1)).toBe('Done!');
+    expect(trimOutputMessages('Done!', 3)).toBe('Done!');
+    expect(trimOutputMessages('Done!', 'all')).toBe('Done!');
   });
 
-  describe('edge cases', () => {
-    it('handles empty output array', () => {
-      const empty: readonly Message[] = [];
-      expect(trimOutputMessages(empty, 1)).toEqual([]);
-      expect(trimOutputMessages(empty, 5)).toEqual([]);
-      expect(trimOutputMessages(empty, 'all')).toEqual([]);
-    });
+  it('preserves empty final-answer output', () => {
+    expect(trimOutputMessages('', 1)).toBe('');
+    expect(trimOutputMessages('', 'all')).toBe('');
   });
 });
diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts
index 2d8cd1df7..19c0e4be4 100644
--- a/apps/cli/test/commands/results/export-e2e-providers.test.ts
+++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts
@@ -29,7 +29,7 @@ const CLAUDE_CLI_RESULT = {
     { text: 'Correct answer', passed: true, evidence: 'Matched expected output' },
     { text: 'Used reasoning', passed: true },
   ],
-  output: [{ role: 'assistant', content: 'The answer is 42, derived through extended thinking.' }],
+  output: 'The answer is 42, derived through extended thinking.',
   target: 'claude-cli',
   scores: [
     {
@@ -66,7 +66,7 @@ const CODEX_RESULT = {
     { text: 'File edited correctly', passed: true },
     { text: 'No extra changes', passed: true },
   ],
-  output: [{ role: 'assistant', content: 'Applied the requested edit to src/main.ts.' }],
+  output: 'Applied the requested edit to src/main.ts.',
   target: 'codex',
   scores: [
     {
@@ -102,7 +102,7 @@ const COPILOT_RESULT = {
     { text: 'Code completion correct', passed: true },
     { text: 'Follows style guide', passed: false, evidence: 'Missing semicolons' },
   ],
-  output: [{ role: 'assistant', content: 'function add(a, b) { return a + b }' }],
+  output: 'function add(a, b) { return a + b }',
   target: 'copilot-cli',
   scores: [
     {
@@ -131,7 +131,7 @@ const PI_RESULT = {
     { text: 'Refactored correctly', passed: true },
     { text: 'Tests pass', passed: false, evidence: 'Test suite has 1 failure' },
   ],
-  output: [{ role: 'assistant', content: 'Refactored the module to use dependency injection.' }],
+  output: 'Refactored the module to use dependency injection.',
   target: 'pi-coding-agent',
   duration_ms: 15000,
   token_usage: { input: 4000, output: 2000 },
@@ -146,7 +146,7 @@ const LLM_AZURE_RESULT = {
   suite: 'multi-provider',
   score: 1.0,
   assertions: [{ text: 'Analysis correct', passed: true }],
-  output: [{ role: 'assistant', content: 'The code has a race condition in the connection pool.' }],
+  output: 'The code has a race condition in the connection pool.',
   target: 'azure-o4-mini',
   scores: [
     {
@@ -169,7 +169,7 @@ const LLM_GPT_RESULT = {
   suite: 'multi-provider',
   score: 0.8,
   assertions: [{ text: 'Analysis correct', passed: true }],
-  output: [{ role: 'assistant', content: 'There might be a concurrency issue.' }],
+  output: 'There might be a concurrency issue.',
   target: 'gpt-4.1',
   duration_ms: 2800,
   token_usage: { input: 1200, output: 400 },
@@ -184,7 +184,7 @@ const MINIMAL_RESULT = {
   suite: 'multi-provider',
   score: 0.5,
   assertions: [{ text: 'Exists', passed: true }],
-  output: [{ role: 'assistant', content: 'Response.' }],
+  output: 'Response.',
   target: 'mock',
   execution_status: 'ok',
 };
@@ -196,7 +196,7 @@ const ERROR_RESULT = {
   suite: 'multi-provider',
   score: 0,
   assertions: [],
-  output: [],
+  output: '',
   target: 'claude-cli',
   error: 'Agent timed out after 120s',
   duration_ms: 120000,
@@ -527,7 +527,7 @@ describe('export e2e — multi-provider metrics verification', () => {
 
   // ── Output artifact tests ──────────────────────────────────────────────
 
-  describe('<test-id>/outputs/response.md — human-readable agent responses', () => {
+  describe('<test-id>/outputs/answer.md — human-readable agent responses', () => {
     it('should write answer text for each provider as markdown', async () => {
       const outputDir = path.join(tempDir, 'outputs');
       const content = toJsonl(CLAUDE_CLI_RESULT, CODEX_RESULT, COPILOT_RESULT);
@@ -536,24 +536,24 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       expect(
         readFileSync(
-          path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'outputs', 'response.md'),
+          path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'outputs', 'answer.md'),
           'utf8',
         ),
-      ).toBe('@[assistant]:\nThe answer is 42, derived through extended thinking.');
+      ).toBe('The answer is 42, derived through extended thinking.');
 
       expect(
         readFileSync(
-          path.join(artifactDir(outputDir, CODEX_RESULT), 'outputs', 'response.md'),
+          path.join(artifactDir(outputDir, CODEX_RESULT), 'outputs', 'answer.md'),
           'utf8',
         ),
-      ).toBe('@[assistant]:\nApplied the requested edit to src/main.ts.');
+      ).toBe('Applied the requested edit to src/main.ts.');
 
       expect(
         readFileSync(
-          path.join(artifactDir(outputDir, COPILOT_RESULT), 'outputs', 'response.md'),
+          path.join(artifactDir(outputDir, COPILOT_RESULT), 'outputs', 'answer.md'),
           'utf8',
         ),
-      ).toBe('@[assistant]:\nfunction add(a, b) { return a + b }');
+      ).toBe('function add(a, b) { return a + b }');
     });
 
     it('should not write output file for error result with empty answer', async () => {
@@ -563,7 +563,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       expect(
-        existsSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'outputs', 'response.md')),
+        existsSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'outputs', 'answer.md')),
       ).toBe(false);
     });
   });
diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts
index 75f599e33..13a7b49eb 100644
--- a/apps/cli/test/commands/results/export.test.ts
+++ b/apps/cli/test/commands/results/export.test.ts
@@ -26,7 +26,7 @@ const RESULT_FULL = {
     { text: 'Says hello', passed: true },
     { text: 'Uses name', passed: true },
   ],
-  output: [{ role: 'assistant', content: 'Hello, Alice!' }],
+  output: 'Hello, Alice!',
   target: 'gpt-4o',
   scores: [
     {
@@ -88,7 +88,7 @@ const RESULT_NO_TRACE = {
   suite: 'demo',
   score: 1.0,
   assertions: [{ text: 'Correct', passed: true }],
-  output: [{ role: 'assistant', content: 'Yes.' }],
+  output: 'Yes.',
   target: 'default',
   token_usage: { input: 50, output: 20 },
   cost_usd: 0.001,
@@ -210,7 +210,10 @@ describe('results export', () => {
       execution_status: 'ok',
       grading_path: 'demo/test-greeting/grading.json',
       timing_path: 'demo/test-greeting/timing.json',
-      output_path: 'demo/test-greeting/outputs/response.md',
+      output_path: 'demo/test-greeting/outputs/answer.md',
+      answer_path: 'demo/test-greeting/outputs/answer.md',
+      response_path: 'demo/test-greeting/outputs/response.md',
+      transcript_path: 'demo/test-greeting/outputs/transcript.jsonl',
       input_path: 'demo/test-greeting/input.md',
     });
   });
@@ -270,15 +273,19 @@ describe('results export', () => {
     expect(existsSync(perTestTimingPath)).toBe(true);
   });
 
-  it('should write answer text to <test-id>/outputs/response.md as human-readable markdown', async () => {
+  it('should write answer text to <test-id>/outputs/answer.md as human-readable markdown', async () => {
     const outputDir = path.join(tempDir, 'output');
     const content = toJsonl(RESULT_FULL);
 
     await exportResults('test.jsonl', content, outputDir);
 
-    const answerPath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'response.md');
+    const answerPath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'answer.md');
     expect(existsSync(answerPath)).toBe(true);
-    expect(readFileSync(answerPath, 'utf8')).toBe('@[assistant]:\nHello, Alice!');
+    expect(readFileSync(answerPath, 'utf8')).toBe('Hello, Alice!');
+
+    const responsePath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'response.md');
+    expect(existsSync(responsePath)).toBe(true);
+    expect(readFileSync(responsePath, 'utf8')).toBe('Hello, Alice!');
   });
 
   it('should group results by target in benchmark.json', async () => {
@@ -345,7 +352,7 @@ describe('results export', () => {
     const answerPath = path.join(
       artifactDir(outputDir, RESULT_DIFFERENT_TARGET),
       'outputs',
-      'response.md',
+      'answer.md',
     );
     expect(existsSync(answerPath)).toBe(false);
   });
diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts
index e2040eeea..af2d9769f 100644
--- a/apps/cli/test/commands/results/report.test.ts
+++ b/apps/cli/test/commands/results/report.test.ts
@@ -4,7 +4,7 @@ import { tmpdir } from 'node:os';
 import path from 'node:path';
 import vm from 'node:vm';
 
-import type { EvaluationResult, GraderResult } from '@agentv/core';
+import { type EvaluationResult, type GraderResult, buildTraceFromMessages } from '@agentv/core';
 
 import { writeArtifactsFromResults } from '../../../src/commands/eval/artifact-writer.js';
 import {
@@ -29,13 +29,13 @@ function makeScore(
 }
 
 function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
-  return {
+  const result = {
     timestamp: '2026-04-15T01:00:00.000Z',
     testId: 'test-1',
     suite: 'default',
     score: 1,
     assertions: [{ text: 'fallback assertion', passed: true, evidence: 'ok' }],
-    output: [{ role: 'assistant', content: 'answer' }],
+    output: 'answer',
     input: [{ role: 'user', content: 'question' }],
     target: 'default',
     executionStatus: 'ok',
@@ -43,6 +43,21 @@ function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult
     durationMs: 1200,
     ...overrides,
   };
+
+  return {
+    ...result,
+    trace:
+      result.trace ??
+      buildTraceFromMessages({
+        input: Array.isArray(result.input) ? result.input : [],
+        output: result.output ? [{ role: 'assistant', content: result.output }] : [],
+        finalOutput: result.output,
+        target: result.target,
+        testId: result.testId,
+        tokenUsage: result.tokenUsage,
+        durationMs: result.durationMs,
+      }),
+  };
 }
 
 describe('results report', () => {
diff --git a/apps/cli/test/commands/results/show.test.ts b/apps/cli/test/commands/results/show.test.ts
index e4a625b2d..700d43b1f 100644
--- a/apps/cli/test/commands/results/show.test.ts
+++ b/apps/cli/test/commands/results/show.test.ts
@@ -13,7 +13,7 @@ const makeResult = (overrides: Partial<EvaluationResult> = {}): EvaluationResult
       { text: "contains 'Dear'", passed: false, evidence: "'Dear' not found" },
       { text: 'contains greeting', passed: true },
     ],
-    output: [{ role: 'assistant', content: 'Hi there!' }],
+    output: 'Hi there!',
     input: [{ role: 'user', content: 'Give a formal greeting' }],
     executionStatus: 'success',
     durationMs: 1200,
diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts
index ed091ef41..c0b5ec535 100644
--- a/apps/cli/test/commands/trace/trace.test.ts
+++ b/apps/cli/test/commands/trace/trace.test.ts
@@ -205,7 +205,12 @@ describe('trace utils', () => {
 
       expect(results).toHaveLength(1);
       expect(results[0].test_id).toBe('test-2');
-      expect(results[0].trace).toBeUndefined();
+      expect(results[0].trace).toMatchObject({
+        schema_version: 'agentv.trace.v1',
+        event_count: 0,
+        messages: [],
+        events: [],
+      });
     });
 
     it('loads index.jsonl directly', () => {
@@ -216,7 +221,12 @@ describe('trace utils', () => {
 
       expect(results).toHaveLength(1);
       expect(results[0].test_id).toBe('test-2');
-      expect(results[0].trace).toBeUndefined();
+      expect(results[0].trace).toMatchObject({
+        schema_version: 'agentv.trace.v1',
+        event_count: 0,
+        messages: [],
+        events: [],
+      });
     });
 
     it('loads simple trace jsonl exports and keeps spans available for trace commands', () => {
diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts
index 5f92fee9b..ccb022e31 100644
--- a/apps/cli/test/fixtures/mock-run-evaluation.ts
+++ b/apps/cli/test/fixtures/mock-run-evaluation.ts
@@ -41,7 +41,8 @@ interface EvaluationResultLike {
     readonly passed: boolean;
     readonly evidence?: string;
   }[];
-  readonly output: readonly { readonly role: string; readonly content: string }[];
+  readonly output: string;
+  readonly trace: Record<string, unknown>;
   readonly expectedAspectCount: number;
   readonly target: string;
   readonly timestamp: string;
@@ -63,20 +64,53 @@ function evalCaseIds(evalCases: ReadonlyArray<unknown> | undefined): readonly st
     .filter((id): id is string => id !== undefined);
 }
 
+function buildTrace(targetName: string, testId: string, output: string): Record<string, unknown> {
+  const message = { role: 'assistant', content: output };
+  return {
+    schemaVersion: 'agentv.trace.v1',
+    eventCount: 2,
+    toolCalls: {},
+    errorCount: 0,
+    llmCallCount: 1,
+    messages: [message],
+    events: [
+      {
+        eventId: 'message-0',
+        ordinal: 0,
+        type: 'message',
+        message,
+        metadata: { message_index: 0 },
+      },
+      {
+        eventId: 'final-response',
+        parentEventId: 'message-0',
+        ordinal: 1,
+        type: 'final_response',
+        message,
+        metadata: { message_index: 0 },
+      },
+    ],
+    metadata: { provider: 'mock', target: targetName, eval_case_id: testId },
+  };
+}
+
 function buildResult(targetName: string, testId: string, index: number): EvaluationResultLike {
   const baseTime = new Date('2024-01-01T00:00:00.000Z');
   if (testId === 'case-alpha') {
+    const output = 'Alpha answer';
     return {
       testId: 'case-alpha',
       score: 0.6,
       assertions: [{ text: 'alpha', passed: true }],
-      output: [{ role: 'assistant', content: 'Alpha answer' }],
+      output,
+      trace: buildTrace(targetName, 'case-alpha', output),
       expectedAspectCount: 1,
       target: targetName,
       timestamp: baseTime.toISOString(),
     };
   }
   if (testId === 'case-beta') {
+    const output = 'Beta answer';
     return {
       testId: 'case-beta',
       score: 0.9,
@@ -85,17 +119,20 @@ function buildResult(targetName: string, testId: string, index: number): Evaluat
         { text: 'gamma', passed: true },
         { text: 'delta', passed: false },
       ],
-      output: [{ role: 'assistant', content: 'Beta answer' }],
+      output,
+      trace: buildTrace(targetName, 'case-beta', output),
       expectedAspectCount: 3,
       target: targetName,
       timestamp: new Date(baseTime.getTime() + 60_000).toISOString(),
     };
   }
+  const output = `${testId} answer`;
   return {
     testId,
     score: 1,
     assertions: [{ text: testId, passed: true }],
-    output: [{ role: 'assistant', content: `${testId} answer` }],
+    output,
+    trace: buildTrace(targetName, testId, output),
     expectedAspectCount: 1,
     target: targetName,
     timestamp: new Date(baseTime.getTime() + index * 60_000).toISOString(),
diff --git a/packages/core/src/observability/otel-exporter.ts b/packages/core/src/observability/otel-exporter.ts
index d3c400a9a..7a39635aa 100644
--- a/packages/core/src/observability/otel-exporter.ts
+++ b/packages/core/src/observability/otel-exporter.ts
@@ -216,8 +216,10 @@ export class OtelTraceExporter {
             rootSpan.setAttribute('agentv.trace.llm_call_count', t.llmCallCount);
         }
 
-        // Child spans from trace messages (--trace mode)
-        const traceMessages = result.trace.messages;
+        // Child spans from canonical trace messages.
+        // Some callers may still export older result artifacts while migrating,
+        // so tolerate a missing trace instead of crashing the exporter.
+        const traceMessages = result.trace?.messages ?? [];
         if (traceMessages.length > 0) {
           const parentCtx = api.trace.setSpan(api.context.active(), rootSpan);
 
diff --git a/packages/core/test/evaluation/code-grader-multimodal.test.ts b/packages/core/test/evaluation/code-grader-multimodal.test.ts
index 25f92711d..130eb5e0e 100644
--- a/packages/core/test/evaluation/code-grader-multimodal.test.ts
+++ b/packages/core/test/evaluation/code-grader-multimodal.test.ts
@@ -269,8 +269,9 @@ describe('CodeGrader multimodal integration', () => {
     expect(result.score).toBe(1.0);
     const details = result.details as Record<string, unknown>;
     const payload = details.payload as Record<string, unknown>;
-    const outputMsgs = payload.output as Record<string, unknown>[];
-    expect(outputMsgs[0].content).toBe('Hello world');
+    expect(payload.output).toBe('answer');
+    const messages = payload.messages as Record<string, unknown>[];
+    expect(messages[0].content).toBe('Hello world');
   });
 
   it('materializes image data URIs in output for grader', async () => {
@@ -300,8 +301,9 @@ describe('CodeGrader multimodal integration', () => {
     // Verify the grader received the payload with image paths (not data URIs)
     const details = result.details as Record<string, unknown>;
     const payload = details.payload as Record<string, unknown>;
-    const outputMsgs = payload.output as Record<string, unknown>[];
-    const content = outputMsgs[0].content as Record<string, unknown>[];
+    expect(payload.output).toBe('answer');
+    const messages = payload.messages as Record<string, unknown>[];
+    const content = messages[0].content as Record<string, unknown>[];
 
     // Text block preserved
     expect(content[0]).toEqual({ type: 'text', text: 'Generated chart:' });
diff --git a/packages/core/test/evaluation/conversation-mode.test.ts b/packages/core/test/evaluation/conversation-mode.test.ts
index 2eeb8eee4..cdeca3c30 100644
--- a/packages/core/test/evaluation/conversation-mode.test.ts
+++ b/packages/core/test/evaluation/conversation-mode.test.ts
@@ -549,7 +549,7 @@ describe('runEvalCase — conversation mode', () => {
     expect(turn2Score?.score).toBe(1.0);
   });
 
-  it('output contains full conversation transcript with all user and assistant messages', async () => {
+  it('output is the final answer while trace contains the full conversation transcript', async () => {
     const provider = new SequenceProvider('mock', [
       assistantResponse('Answer 1'),
       assistantResponse('Answer 2'),
@@ -574,10 +574,12 @@ describe('runEvalCase — conversation mode', () => {
       now: nowFn,
     });
 
-    // Output should have all messages from the conversation
-    const output = result.output ?? [];
-    const userMessages = output.filter((m) => m.role === 'user');
-    const assistantMessages = output.filter((m) => m.role === 'assistant');
+    // Output is only the final answer/scored result.
+    expect(result.output).toBe('Answer 2');
+
+    // Trace preserves all messages from the conversation.
+    const userMessages = result.trace.messages.filter((m) => m.role === 'user');
+    const assistantMessages = result.trace.messages.filter((m) => m.role === 'assistant');
 
     expect(userMessages.length).toBe(2);
     expect(assistantMessages.length).toBe(2);
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index 29a339bf0..ebdb2fcee 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -801,10 +801,8 @@ describe('runEvalCase trace integration', () => {
     expect(result.trace?.errorCount).toBe(0);
   });
 
-  it('omits trace when provider returns no output', async () => {
-    const provider = new TraceProvider('mock', {
-      output: [{ role: 'assistant', content: 'The weather is sunny' }],
-    });
+  it('builds a canonical trace even when provider returns no output', async () => {
+    const provider = new TraceProvider('mock', {});
 
     const result = await runEvalCase({
       evalCase: traceTestCase,
@@ -813,7 +811,10 @@ describe('runEvalCase trace integration', () => {
       evaluators: evaluatorRegistry,
     });
 
-    expect(result.trace).toBeUndefined();
+    expect(result.output).toBe('');
+    expect(result.trace).toBeDefined();
+    expect(result.trace.messages.map((message) => message.role)).toEqual(['user', 'assistant']);
+    expect(result.trace.events.some((event) => event.type === 'final_response')).toBe(true);
   });
 
   it('includes trace when provider reports tokenUsage without output', async () => {
@@ -907,7 +908,7 @@ describe('runEvalCase trace integration', () => {
     expect(result.scores?.[0]?.verdict).toBe('pass');
   });
 
-  it('fails tool-trajectory evaluator when no trace available', async () => {
+  it('fails tool-trajectory evaluator when the trace has no matching tools', async () => {
     const provider = new TraceProvider('mock', {
       output: [{ role: 'assistant', content: 'Result' }],
     });
@@ -944,7 +945,7 @@ describe('runEvalCase trace integration', () => {
     expect(result.score).toBe(0);
     expect(result.scores?.[0]?.verdict).toBe('fail');
     expect(result.scores?.[0]?.assertions.filter((a) => !a.passed).map((a) => a.text)).toContain(
-      'No trace available for evaluation',
+      'search: called 0 times (required >=1)',
     );
   });
 
diff --git a/packages/core/test/fixtures/test-define-grader.ts b/packages/core/test/fixtures/test-define-grader.ts
index f5c41f75d..344a5b09c 100644
--- a/packages/core/test/fixtures/test-define-grader.ts
+++ b/packages/core/test/fixtures/test-define-grader.ts
@@ -7,8 +7,9 @@ import { defineCodeGrader } from '../../../eval/src/index.js';
 export default defineCodeGrader(({ output, criteria }) => {
   const assertions: { text: string; passed: boolean }[] = [];
 
-  // Extract text from the output message array
-  const candidateText = (output ?? []).map((m) => String(m.content ?? '')).join(' ');
+  // `output` is the final answer/scored result. Transcript-aware graders should
+  // use messages/trace instead.
+  const candidateText = output ?? '';
 
   // Simple check: does candidate mention the criteria keywords?
   const outcomeWords = criteria.toLowerCase().split(/\s+/);
diff --git a/packages/core/test/fixtures/test-grader-with-details.cjs b/packages/core/test/fixtures/test-grader-with-details.cjs
index b11c34d36..54e40eed2 100644
--- a/packages/core/test/fixtures/test-grader-with-details.cjs
+++ b/packages/core/test/fixtures/test-grader-with-details.cjs
@@ -7,9 +7,16 @@ const fs = require('node:fs');
 const input = JSON.parse(fs.readFileSync(0, 'utf8'));
 
 const hasExpected = Array.isArray(input.expected_output);
-// Extract candidate text from the output message array
-const outputMessages = Array.isArray(input.output) ? input.output : [];
-const candidateText = outputMessages.map((m) => String(m.content ?? '')).join('');
+// `output` is the final answer/scored result. Keep a tiny legacy fallback so
+// this fixture can still explain failures if an old message-array payload leaks.
+const candidateText =
+  typeof input.output === 'string'
+    ? input.output
+    : typeof input.answer === 'string'
+      ? input.answer
+      : Array.isArray(input.output)
+        ? input.output.map((m) => String(m.content ?? '')).join('')
+        : '';
 const hasCandidate = candidateText.length > 0;
 
 // Emit details with structured metrics
diff --git a/packages/core/test/fixtures/test-grader.cjs b/packages/core/test/fixtures/test-grader.cjs
index e341fb69f..5e042b807 100644
--- a/packages/core/test/fixtures/test-grader.cjs
+++ b/packages/core/test/fixtures/test-grader.cjs
@@ -4,11 +4,18 @@ const fs = require('node:fs');
 const input = JSON.parse(fs.readFileSync(0, 'utf8'));
 
 const hasExpected = Array.isArray(input.expected_output);
-// Extract candidate text from the output message array
-const outputMessages = Array.isArray(input.output) ? input.output : [];
-const candidateText = outputMessages
-  .map((m) => (typeof m.content === 'string' ? m.content : JSON.stringify(m.content)))
-  .join('');
+// `output` is the final answer/scored result. Keep a tiny legacy fallback so
+// this fixture can still explain failures if an old message-array payload leaks.
+const candidateText =
+  typeof input.output === 'string'
+    ? input.output
+    : typeof input.answer === 'string'
+      ? input.answer
+      : Array.isArray(input.output)
+        ? input.output
+            .map((m) => (typeof m.content === 'string' ? m.content : JSON.stringify(m.content)))
+            .join('')
+        : '';
 const hasCandidate = candidateText.length > 0;
 let candidateDecisionOk = false;
 
diff --git a/packages/core/test/observability/otel-exporter.test.ts b/packages/core/test/observability/otel-exporter.test.ts
index 9e2035a1a..c10ef1ba0 100644
--- a/packages/core/test/observability/otel-exporter.test.ts
+++ b/packages/core/test/observability/otel-exporter.test.ts
@@ -4,6 +4,7 @@
  */
 
 import { afterEach, describe, expect, it } from 'bun:test';
+import { buildTraceFromMessages } from '../../src/evaluation/trace.js';
 import { OTEL_BACKEND_PRESETS, OtelTraceExporter } from '../../src/observability/otel-exporter.js';
 
 // ---------------------------------------------------------------------------
@@ -225,7 +226,13 @@ describe('W3C traceparent propagation', () => {
       testId: 'test-tp',
       target: 'my-agent',
       score: 1,
-      output: [{ role: 'assistant' as const, content: 'ok' }],
+      output: 'ok',
+      trace: buildTraceFromMessages({
+        output: [{ role: 'assistant' as const, content: 'ok' }],
+        finalOutput: 'ok',
+        target: 'my-agent',
+        testId: 'test-tp',
+      }),
       timestamp: new Date().toISOString(),
     }) as unknown as Parameters<OtelTraceExporter['exportResult']>[0];
 
@@ -353,14 +360,20 @@ describe('Per-span token usage metrics', () => {
       target: 'my-agent',
       score: 1,
       timestamp: new Date().toISOString(),
-      output: [
-        {
-          role: 'assistant',
-          content: 'hello',
-          metadata: { model: 'gpt-4' },
-          tokenUsage: { input: 100, output: 50, cached: 25 },
-        },
-      ],
+      output: 'hello',
+      trace: buildTraceFromMessages({
+        output: [
+          {
+            role: 'assistant',
+            content: 'hello',
+            metadata: { model: 'gpt-4' },
+            tokenUsage: { input: 100, output: 50, cached: 25 },
+          },
+        ],
+        finalOutput: 'hello',
+        target: 'my-agent',
+        testId: 'test-tokens',
+      }),
     } as unknown as Parameters<OtelTraceExporter['exportResult']>[0];
 
     await setup.exporter.exportResult(result);
@@ -385,13 +398,19 @@ describe('Per-span token usage metrics', () => {
       target: 'my-agent',
       score: 1,
       timestamp: new Date().toISOString(),
-      output: [
-        {
-          role: 'assistant',
-          content: 'hello',
-          metadata: { model: 'gpt-4' },
-        },
-      ],
+      output: 'hello',
+      trace: buildTraceFromMessages({
+        output: [
+          {
+            role: 'assistant',
+            content: 'hello',
+            metadata: { model: 'gpt-4' },
+          },
+        ],
+        finalOutput: 'hello',
+        target: 'my-agent',
+        testId: 'test-no-tokens',
+      }),
     } as unknown as Parameters<OtelTraceExporter['exportResult']>[0];
 
     await setup.exporter.exportResult(result);
@@ -416,14 +435,20 @@ describe('Per-span token usage metrics', () => {
       target: 'my-agent',
       score: 1,
       timestamp: new Date().toISOString(),
-      output: [
-        {
-          role: 'assistant',
-          content: 'hello',
-          metadata: { model: 'gpt-4' },
-          tokenUsage: { input: 200, output: 75 },
-        },
-      ],
+      output: 'hello',
+      trace: buildTraceFromMessages({
+        output: [
+          {
+            role: 'assistant',
+            content: 'hello',
+            metadata: { model: 'gpt-4' },
+            tokenUsage: { input: 200, output: 75 },
+          },
+        ],
+        finalOutput: 'hello',
+        target: 'my-agent',
+        testId: 'test-partial-tokens',
+      }),
     } as unknown as Parameters<OtelTraceExporter['exportResult']>[0];
 
     await setup.exporter.exportResult(result);
diff --git a/packages/eval/test/define-code-grader.test.ts b/packages/eval/test/define-code-grader.test.ts
index e09c0ba49..e17e93230 100644
--- a/packages/eval/test/define-code-grader.test.ts
+++ b/packages/eval/test/define-code-grader.test.ts
@@ -14,6 +14,16 @@ import {
   MessageSchema,
 } from '../src/schemas.js';
 
+const makeTrace = (overrides: Record<string, unknown> = {}) => ({
+  schemaVersion: 'agentv.trace.v1',
+  eventCount: 3,
+  toolCalls: { read: 2, write: 1 },
+  errorCount: 0,
+  messages: [],
+  events: [],
+  ...overrides,
+});
+
 // ---------------------------------------------------------------------------
 // Content schemas
 // ---------------------------------------------------------------------------
@@ -180,11 +190,7 @@ describe('CodeGraderInputSchema', () => {
   it('accepts optional trace', () => {
     const inputWithTrace = {
       ...validInput,
-      trace: {
-        eventCount: 3,
-        toolCalls: { read: 2, write: 1 },
-        errorCount: 0,
-      },
+      trace: makeTrace(),
     };
     const result = CodeGraderInputSchema.parse(inputWithTrace);
     expect(result.trace?.eventCount).toBe(3);
@@ -209,10 +215,11 @@ describe('CodeGraderInputSchema', () => {
     expect(result.config).toEqual({ maxToolCalls: 10, strictMode: true });
   });
 
-  it('accepts optional output with toolCalls', () => {
+  it('accepts final output plus transcript messages with toolCalls', () => {
     const inputWithOutput = {
       ...validInput,
-      output: [
+      output: 'Reading file...',
+      messages: [
         {
           role: 'assistant',
           content: 'Reading file...',
@@ -221,13 +228,15 @@ describe('CodeGraderInputSchema', () => {
       ],
     };
     const result = CodeGraderInputSchema.parse(inputWithOutput);
-    expect(result.output?.[0].toolCalls?.[0].tool).toBe('read');
+    expect(result.output).toBe('Reading file...');
+    expect(result.messages?.[0].toolCalls?.[0].tool).toBe('read');
   });
 
-  it('accepts output with Content[] containing image blocks', () => {
+  it('accepts transcript messages with Content[] containing image blocks', () => {
     const inputWithImages = {
       ...validInput,
-      output: [
+      output: 'Generated chart:',
+      messages: [
         {
           role: 'assistant',
           content: [
@@ -238,7 +247,7 @@ describe('CodeGraderInputSchema', () => {
       ],
     };
     const result = CodeGraderInputSchema.parse(inputWithImages);
-    const content = result.output?.[0].content as { type: string; path?: string }[];
+    const content = result.messages?.[0].content as { type: string; path?: string }[];
     expect(content).toHaveLength(2);
     expect(content[1].type).toBe('image');
     expect(content[1].path).toBe('/workspace/chart.png');
diff --git a/packages/eval/test/define-prompt-template.test.ts b/packages/eval/test/define-prompt-template.test.ts
index 890b80201..2ed471c26 100644
--- a/packages/eval/test/define-prompt-template.test.ts
+++ b/packages/eval/test/define-prompt-template.test.ts
@@ -2,6 +2,16 @@ import { describe, expect, it } from 'bun:test';
 
 import { PromptTemplateInputSchema } from '../src/schemas.js';
 
+const makeTrace = (overrides: Record<string, unknown> = {}) => ({
+  schemaVersion: 'agentv.trace.v1',
+  eventCount: 3,
+  toolCalls: { read: 2, write: 1 },
+  errorCount: 0,
+  messages: [],
+  events: [],
+  ...overrides,
+});
+
 describe('PromptTemplateInputSchema', () => {
   // Minimal valid input with all required fields
   const validInput = {
@@ -29,11 +39,7 @@ describe('PromptTemplateInputSchema', () => {
   it('accepts optional trace', () => {
     const inputWithTrace = {
       ...validInput,
-      trace: {
-        eventCount: 3,
-        toolCalls: { read: 2, write: 1 },
-        errorCount: 0,
-      },
+      trace: makeTrace(),
     };
     const result = PromptTemplateInputSchema.parse(inputWithTrace);
     expect(result.trace?.eventCount).toBe(3);
@@ -85,10 +91,11 @@ describe('PromptTemplateInputSchema', () => {
     expect(result.input[0].content).toBe('What is 2+2?');
   });
 
-  it('accepts optional output with toolCalls', () => {
+  it('accepts final output plus transcript messages with toolCalls', () => {
     const inputWithOutput = {
       ...validInput,
-      output: [
+      output: 'Reading file...',
+      messages: [
         {
           role: 'assistant',
           content: 'Reading file...',
@@ -97,21 +104,18 @@ describe('PromptTemplateInputSchema', () => {
       ],
     };
     const result = PromptTemplateInputSchema.parse(inputWithOutput);
-    expect(result.output?.[0].toolCalls?.[0].tool).toBe('read');
+    expect(result.output).toBe('Reading file...');
+    expect(result.messages?.[0].toolCalls?.[0].tool).toBe('read');
   });
 
   it('accepts full input with all fields', () => {
     const fullInput = {
       criteria: 'The answer should be 4',
       expectedOutput: [{ role: 'assistant', content: '4' }],
-      output: [{ role: 'assistant', content: 'The answer is 4' }],
+      output: 'The answer is 4',
       inputFiles: ['/path/to/input.txt'],
       input: [{ role: 'user', content: 'What is 2+2?' }],
-      trace: {
-        eventCount: 1,
-        toolCalls: {},
-        errorCount: 0,
-      },
+      trace: makeTrace({ eventCount: 1, toolCalls: {} }),
       config: { rubric: 'Check correctness' },
     };
     const result = PromptTemplateInputSchema.parse(fullInput);
diff --git a/packages/eval/test/deprecation.test.ts b/packages/eval/test/deprecation.test.ts
index e025fd973..6a63b03c7 100644
--- a/packages/eval/test/deprecation.test.ts
+++ b/packages/eval/test/deprecation.test.ts
@@ -23,15 +23,17 @@ describe('enrichInput — pass-through', () => {
     expect(result).toBe(input);
   });
 
-  it('structured fields (input, output, expectedOutput) remain Message[]', () => {
+  it('structured fields (input, messages, expectedOutput) remain transcript arrays', () => {
     const input = buildInput({
       input: [{ role: 'user', content: 'Hello' }],
-      output: [{ role: 'assistant', content: 'Hi' }],
+      output: 'Hi',
+      messages: [{ role: 'assistant', content: 'Hi' }],
       expectedOutput: [{ role: 'assistant', content: 'Hi there' }],
     });
     enrichInput(input);
     expect(Array.isArray(input.input)).toBe(true);
-    expect(Array.isArray(input.output)).toBe(true);
+    expect(input.output).toBe('Hi');
+    expect(Array.isArray(input.messages)).toBe(true);
     expect(Array.isArray(input.expectedOutput)).toBe(true);
   });
 });
diff --git a/packages/eval/test/file-backed-output.test.ts b/packages/eval/test/file-backed-output.test.ts
index 58e931f3e..27de99630 100644
--- a/packages/eval/test/file-backed-output.test.ts
+++ b/packages/eval/test/file-backed-output.test.ts
@@ -48,12 +48,9 @@ describe('Lazy file-backed output loading', () => {
   });
 
   it('lazily loads output from file when outputPath is set', () => {
-    const messages = [
-      { role: 'assistant', content: 'Hello from file' },
-      { role: 'user', content: 'Test' },
-    ];
+    const answer = 'Hello from file';
     const filePath = join(tmpDir, 'output.json');
-    writeFileSync(filePath, JSON.stringify(messages));
+    writeFileSync(filePath, JSON.stringify(answer));
 
     const input: CodeGraderInput = CodeGraderInputSchema.parse({
       criteria: 'test',
@@ -79,8 +76,7 @@ describe('Lazy file-backed output loading', () => {
 
     // First access triggers file read
     const output = input.output;
-    expect(output).toHaveLength(2);
-    expect(output?.[0].content).toBe('Hello from file');
+    expect(output).toBe('Hello from file');
 
     // Second access uses cache
     const output2 = input.output;
@@ -91,13 +87,12 @@ describe('Lazy file-backed output loading', () => {
     const input: CodeGraderInput = CodeGraderInputSchema.parse({
       criteria: 'test',
       expectedOutput: [],
-      output: [{ role: 'assistant', content: 'inline' }],
+      output: 'inline',
       inputFiles: [],
       input: [],
     });
 
     // No lazy loading needed — output is already present
-    expect(input.output).toHaveLength(1);
-    expect(input.output?.[0].content).toBe('inline');
+    expect(input.output).toBe('inline');
   });
 });

From 34dc821fbdb0c95cfa1f98f5bf6dccbdae58192d Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 12 Jun 2026 11:37:22 +0200
Subject: [PATCH 7/7] fix(evaluation): pass final output to prompt templates

---
 .../evaluation/graders/prompt-resolution.ts   | 10 ++-
 .../graders/prompt-resolution.test.ts         | 73 +++++++++++++++++++
 .../core/test/evaluation/orchestrator.test.ts |  4 +-
 3 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/packages/core/src/evaluation/graders/prompt-resolution.ts b/packages/core/src/evaluation/graders/prompt-resolution.ts
index b31717047..2306c4b56 100644
--- a/packages/core/src/evaluation/graders/prompt-resolution.ts
+++ b/packages/core/src/evaluation/graders/prompt-resolution.ts
@@ -17,7 +17,7 @@ import { toSnakeCaseDeep } from '../case-conversion.js';
 import { readTextFile } from '../file-utils.js';
 import type { Message } from '../providers/types.js';
 import { VALID_TEMPLATE_VARIABLES } from '../template-variables.js';
-import type { TraceSummary } from '../trace.js';
+import type { Trace } from '../trace.js';
 import type { EvalTest, PromptScriptConfig } from '../types.js';
 import { executeScript } from './code-grader.js';
 
@@ -25,7 +25,7 @@ export interface ResolveCustomPromptContext {
   readonly evalCase: EvalTest;
   readonly candidate: string;
   readonly output?: readonly Message[];
-  readonly trace?: TraceSummary;
+  readonly trace?: Trace;
   readonly config?: Record<string, unknown>;
   readonly fileChanges?: string;
   readonly workspacePath?: string;
@@ -97,10 +97,16 @@ async function executePromptTemplate(
   config?: Record<string, unknown>,
   timeoutMs?: number,
 ): Promise<string> {
+  // Prefer the transcript recorded on the trace; fall back to the `output`
+  // message array, and finally to an empty transcript when neither is set.
+  const messages = context.trace?.messages ?? context.output ?? [];
+
   const payload = {
     criteria: context.evalCase.criteria,
     expectedOutput: context.evalCase.expected_output,
-    output: context.output ?? null,
+    output: context.candidate,
+    answer: context.candidate,
+    messages,
     inputFiles: context.evalCase.file_paths,
     input: context.evalCase.input,
     metadata: context.evalCase.metadata ?? null,
diff --git a/packages/core/test/evaluation/graders/prompt-resolution.test.ts b/packages/core/test/evaluation/graders/prompt-resolution.test.ts
index 1c17cec2f..731a28dde 100644
--- a/packages/core/test/evaluation/graders/prompt-resolution.test.ts
+++ b/packages/core/test/evaluation/graders/prompt-resolution.test.ts
@@ -1,9 +1,16 @@
 import { describe, expect, it } from 'bun:test';
+import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+import { fileURLToPath, pathToFileURL } from 'node:url';
 
 import {
   containsTemplateVariables,
   resolveCustomPrompt,
 } from '../../../src/evaluation/graders/prompt-resolution.js';
+import { buildTraceFromMessages } from '../../../src/evaluation/trace.js';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
 describe('containsTemplateVariables', () => {
   it('returns true for template with {{output}}', () => {
@@ -82,4 +89,75 @@ describe('resolveCustomPrompt', () => {
     });
     expect(result).toBeUndefined();
   });
+
+  it('passes final answer as output and transcript through messages/trace to executable prompts', async () => {
+    const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-template-contract-'));
+    try {
+      const promptPath = path.join(tmpDir, 'prompt-template.ts');
+      const promptTemplateRuntime = pathToFileURL(
+        path.resolve(__dirname, '../../../../eval/src/prompt-template.ts'),
+      ).href;
+
+      writeFileSync(
+        promptPath,
+        `import { definePromptTemplate } from ${JSON.stringify(promptTemplateRuntime)};
+
+definePromptTemplate((ctx) => {
+  if (typeof ctx.output !== 'string') {
+    throw new Error('expected output to be the final answer string');
+  }
+  if (ctx.output !== 'Final answer') {
+    throw new Error('unexpected final answer: ' + ctx.output);
+  }
+  if (ctx.answer !== ctx.output) {
+    throw new Error('answer should mirror output');
+  }
+  if (!Array.isArray(ctx.messages) || ctx.messages.length < 2) {
+    throw new Error('expected transcript messages');
+  }
+  if (!ctx.messages.some((message) => message.role === 'assistant' && message.content === 'Trace assistant turn')) {
+    throw new Error('expected transcript message from trace');
+  }
+  if (!ctx.trace || !Array.isArray(ctx.trace.messages) || ctx.trace.messages.length !== ctx.messages.length) {
+    throw new Error('expected full trace with transcript messages');
+  }
+
+  return \`Final: \${ctx.output}; messages: \${ctx.messages.length}; trace: \${ctx.trace.messages.length}\`;
+});
+`,
+      );
+
+      const trace = buildTraceFromMessages({
+        input: [{ role: 'user', content: 'Question?' }],
+        output: [{ role: 'assistant', content: 'Trace assistant turn' }],
+        finalOutput: 'Final answer',
+        target: 'mock',
+        testId: 'prompt-contract',
+      });
+
+      const result = await resolveCustomPrompt(
+        {
+          resolvedPromptScript: [process.execPath, 'run', promptPath],
+        },
+        {
+          evalCase: {
+            id: 'prompt-contract',
+            input: [{ role: 'user', content: 'Question?' }],
+            expected_output: [{ role: 'assistant', content: 'Expected answer' }],
+            file_paths: [],
+            criteria: 'Check final answer.',
+          },
+          candidate: 'Final answer',
+          output: [{ role: 'assistant', content: 'Legacy transcript fallback' }],
+          trace,
+        },
+        5_000,
+      );
+
+      expect(result).toBe('Final: Final answer; messages: 2; trace: 2');
+    } finally {
+      // Remove the scratch directory even when the script or an assertion throws.
+      rmSync(tmpDir, { recursive: true, force: true });
+    }
+  });
 });
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index ebdb2fcee..3511218f8 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -1153,7 +1153,7 @@ describe('runEvalCase trace integration', () => {
 const stdin = readFileSync(0, 'utf8');
 const input = JSON.parse(stdin);
 const question = (input.input || []).map((m) => String(m.content ?? '')).join('\\n');
-const answer = (input.output || []).map((m) => String(m.content ?? '')).join('\\n');
+const answer = String(input.output ?? '');
 const ref = (input.expected_output || []).map((m) => String(m.content ?? '')).join('\\n') || 'none';
 console.log(\`Question: \${question}
 Answer: \${answer}
@@ -1223,7 +1223,7 @@ Reference: \${ref}\`);
 const stdin = fs.readFileSync(0, 'utf8');
 const input = JSON.parse(stdin);
 const question = (input.input || []).map((m) => String(m.content || '')).join('\\n');
-const answer = (input.output || []).map((m) => String(m.content || '')).join('\\n');
+const answer = String(input.output || '');
 console.log('Question: ' + question + '\\nAnswer: ' + answer);
 `,
       );