Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions .agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,6 @@ targets:
thinking: low
stream_log: raw

- name: pi-sdk-openai
provider: pi-coding-agent
subprovider: openai
base_url: ${{ OPENAI_ENDPOINT }}
api_key: ${{ OPENAI_API_KEY }}
model: gpt-5.5
grader_target: openai
thinking: low
stream_log: raw

- name: pi-azure
provider: pi-cli
Expand Down
17 changes: 0 additions & 17 deletions .ntm/palette.md

This file was deleted.

8 changes: 0 additions & 8 deletions .ntm/personas.toml

This file was deleted.

156 changes: 100 additions & 56 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@ import {
type EvalTest,
type EvaluationResult,
type GraderResult,
type Message,
type TargetDefinition,
toTranscriptJsonLines,
type TraceSummary,
buildTraceFromMessages,
extractLastAssistantContent,
traceToTranscriptJsonLines,
} from '@agentv/core';
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
import { RESULT_INDEX_FILENAME } from './result-layout.js';
Expand Down Expand Up @@ -195,7 +199,10 @@ export interface IndexArtifactEntry {
readonly grading_path: string;
readonly timing_path: string;
readonly output_path?: string;
readonly answer_path?: string;
readonly transcript_path?: string;
readonly input_path?: string;
/** @deprecated Use output_path/answer_path for the final answer. */
readonly response_path?: string;
readonly task_dir?: string;
readonly eval_path?: string;
Expand Down Expand Up @@ -245,23 +252,8 @@ function countToolCalls(result: EvaluationResult): {
toolCalls: Record<string, number>;
total: number;
} {
const toolCalls: Record<string, number> = {};
let total = 0;

const trace = result.trace as
| { steps?: readonly { toolName?: string; type?: string }[] }
| undefined;

if (trace?.steps) {
for (const step of trace.steps) {
if (step.toolName || step.type === 'tool') {
const name = step.toolName ?? 'unknown';
toolCalls[name] = (toolCalls[name] ?? 0) + 1;
total += 1;
}
}
}

const toolCalls = { ...(result.trace?.toolCalls ?? {}) };
const total = Object.values(toolCalls).reduce((sum, count) => sum + count, 0);
return { toolCalls, total };
}

Expand Down Expand Up @@ -365,9 +357,8 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact
workspace_changes: parseWorkspaceChanges(result.fileChanges),
conversation: result.conversationId
? {
turns: result.trace
? ((result.trace as { steps?: readonly unknown[] }).steps?.length ?? 0)
: 0,
turns:
result.trace?.messages.filter((message) => message.role === 'assistant').length ?? 0,
conversation_id: result.conversationId,
}
: undefined,
Expand Down Expand Up @@ -661,7 +652,10 @@ export function buildIndexArtifactEntry(
gradingPath: string;
timingPath: string;
outputPath?: string;
answerPath?: string;
transcriptPath?: string;
inputPath?: string;
responsePath?: string;
taskBundle?: MaterializedTaskBundlePaths;
},
): IndexArtifactEntry {
Expand Down Expand Up @@ -689,9 +683,18 @@ export function buildIndexArtifactEntry(
output_path: options.outputPath
? toRelativeArtifactPath(options.outputDir, options.outputPath)
: undefined,
answer_path: options.answerPath
? toRelativeArtifactPath(options.outputDir, options.answerPath)
: undefined,
transcript_path: options.transcriptPath
? toRelativeArtifactPath(options.outputDir, options.transcriptPath)
: undefined,
input_path: options.inputPath
? toRelativeArtifactPath(options.outputDir, options.inputPath)
: undefined,
response_path: options.responsePath
? toRelativeArtifactPath(options.outputDir, options.responsePath)
: undefined,
...buildTaskBundleIndexFields(options.outputDir, options.taskBundle),
metadata: result.metadata,
};
Expand All @@ -703,7 +706,8 @@ export function buildResultIndexArtifact(
): ResultIndexArtifact {
const artifactSubdir = buildArtifactSubdir(result);
const input = extractInput(result);
const hasResponse = Array.isArray(result.output) && result.output.length > 0;
const hasAnswer = result.output.length > 0;
const hasTranscript = result.trace.messages.length > 0 || result.trace.events.length > 0;

return {
timestamp: result.timestamp,
Expand All @@ -725,10 +729,12 @@ export function buildResultIndexArtifact(
grading_path: path.posix.join(artifactSubdir, 'grading.json'),
timing_path: path.posix.join(artifactSubdir, 'timing.json'),
input_path: input ? path.posix.join(artifactSubdir, 'input.md') : undefined,
output_path: hasResponse
? path.posix.join(artifactSubdir, 'outputs', 'response.md')
output_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined,
answer_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined,
transcript_path: hasTranscript
? path.posix.join(artifactSubdir, 'outputs', 'transcript.jsonl')
: undefined,
response_path: hasResponse
response_path: hasAnswer
? path.posix.join(artifactSubdir, 'outputs', 'response.md')
: undefined,
...(taskBundle
Expand Down Expand Up @@ -756,6 +762,16 @@ async function writeJsonlFile(filePath: string, records: readonly unknown[]): Pr
await writeFile(filePath, content, 'utf8');
}

async function writeTranscriptJsonl(filePath: string, result: EvaluationResult): Promise<void> {
const lines = traceToTranscriptJsonLines(result.trace, {
testId: result.testId,
target: result.target,
});
const content =
lines.length > 0 ? `${lines.map((line) => JSON.stringify(line)).join('\n')}\n` : '';
await writeFile(filePath, content, 'utf8');
}

function isRecord(value: unknown): value is Record<string, unknown> {
return typeof value === 'object' && value !== null && !Array.isArray(value);
}
Expand Down Expand Up @@ -852,6 +868,7 @@ type ParsedEvaluationResult = Record<string, unknown> & {
assertions: EvaluationResult['assertions'];
target: string;
output: EvaluationResult['output'];
trace: EvaluationResult['trace'];
executionStatus: EvaluationResult['executionStatus'];
};

Expand All @@ -874,7 +891,7 @@ function isAssertionEntry(value: unknown): value is EvaluationResult['assertions
);
}

function isOutputMessage(value: unknown): value is EvaluationResult['output'][number] {
function isOutputMessage(value: unknown): value is Message {
if (!value || typeof value !== 'object' || Array.isArray(value)) {
return false;
}
Expand All @@ -890,20 +907,56 @@ function isExecutionStatus(value: unknown): value is EvaluationResult['execution
);
}

function isTraceRecord(value: unknown): value is EvaluationResult['trace'] {
return (
!!value &&
typeof value === 'object' &&
!Array.isArray(value) &&
Array.isArray((value as { messages?: unknown }).messages) &&
Array.isArray((value as { events?: unknown }).events)
);
}

function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefined {
if (!value || typeof value !== 'object' || Array.isArray(value)) {
return undefined;
}

const result = value as Record<string, unknown>;
const legacyOutputMessages = Array.isArray(result.output)
? result.output.filter(isOutputMessage)
: undefined;
const output =
typeof result.output === 'string'
? result.output
: extractLastAssistantContent(legacyOutputMessages);
const legacySummary =
result.trace && typeof result.trace === 'object' && !Array.isArray(result.trace)
? (result.trace as TraceSummary)
: undefined;
const trace = isTraceRecord(result.trace)
? result.trace
: buildTraceFromMessages({
input: Array.isArray(result.input) ? (result.input as EvaluationResult['input']) : [],
output: legacyOutputMessages,
summary: legacySummary,
finalOutput: output,
tokenUsage: result.tokenUsage as EvaluationResult['tokenUsage'],
costUsd: typeof result.costUsd === 'number' ? result.costUsd : undefined,
durationMs: typeof result.durationMs === 'number' ? result.durationMs : undefined,
target: typeof result.target === 'string' ? result.target : undefined,
testId: typeof result.testId === 'string' ? result.testId : undefined,
});

return {
...result,
timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(),
testId: typeof result.testId === 'string' ? result.testId : 'unknown',
score: typeof result.score === 'number' ? result.score : 0,
assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [],
target: typeof result.target === 'string' ? result.target : 'unknown',
output: Array.isArray(result.output) ? result.output.filter(isOutputMessage) : [],
output,
trace,
executionStatus: isExecutionStatus(result.executionStatus) ? result.executionStatus : 'ok',
};
}
Expand Down Expand Up @@ -959,23 +1012,10 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri
const lines: string[] = [];

for (const result of results) {
const transcriptLines = toTranscriptJsonLines(
{
messages: [...(result.input ?? []), ...result.output],
source: {
provider: result.target,
sessionId: result.conversationId ?? result.testId,
startedAt: result.timestamp,
},
tokenUsage: result.tokenUsage,
durationMs: result.durationMs,
costUsd: result.costUsd,
},
{
testId: result.testId,
target: result.target,
},
);
const transcriptLines = traceToTranscriptJsonLines(result.trace, {
testId: result.testId,
target: result.target,
});

lines.push(...transcriptLines.map((line) => JSON.stringify(line)));
}
Expand Down Expand Up @@ -1085,14 +1125,16 @@ export async function writePerTestArtifacts(
if (input) {
await writeFile(path.join(testDir, 'input.md'), input, 'utf8');
}
if (result.output && result.output.length > 0) {
if (result.output.length > 0 || result.trace.messages.length > 0) {
const outputsDir = path.join(testDir, 'outputs');
await mkdir(outputsDir, { recursive: true });
await writeFile(
path.join(outputsDir, 'response.md'),
formatOutputMarkdown(result.output),
'utf8',
);
if (result.output.length > 0) {
await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8');
// Deprecated compatibility alias. New consumers should use answer.md
// for scored output or transcript.jsonl for the full execution record.
await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8');
}
await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result);
}

const taskBundle = await materializeTaskBundleForResult({
Expand Down Expand Up @@ -1156,14 +1198,16 @@ export async function writeArtifactsFromResults(
await writeFile(path.join(testDir, 'input.md'), input, 'utf8');
}

if (result.output && result.output.length > 0) {
if (result.output.length > 0 || result.trace.messages.length > 0) {
const outputsDir = path.join(testDir, 'outputs');
await mkdir(outputsDir, { recursive: true });
await writeFile(
path.join(outputsDir, 'response.md'),
formatOutputMarkdown(result.output),
'utf8',
);
if (result.output.length > 0) {
await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8');
// Deprecated compatibility alias. New consumers should use answer.md
// for scored output or transcript.jsonl for the full execution record.
await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8');
}
await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result);
}

const taskBundle = await materializeTaskBundleForResult({
Expand Down
17 changes: 13 additions & 4 deletions apps/cli/src/commands/eval/commands/assert.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import path from 'node:path';
import { command, option, optional, positional, string } from 'cmd-ts';
import fg from 'fast-glob';

import { executeScript } from '@agentv/core';
import { buildTraceFromMessages, executeScript } from '@agentv/core';

export const evalAssertCommand = command({
name: 'assert',
Expand Down Expand Up @@ -64,17 +64,26 @@ export const evalAssertCommand = command({

// Build payload matching CodeGrader's expected format (snake_case).
// Include all fields that defineCodeGrader validates as required.
const messages = [{ role: 'assistant' as const, content: resolvedOutput }];
const inputMessages = [{ role: 'user' as const, content: resolvedInput }];
const trace = buildTraceFromMessages({
input: inputMessages,
output: messages,
finalOutput: resolvedOutput,
});
const payload = JSON.stringify(
{
output: [{ role: 'assistant', content: resolvedOutput }],
input: [{ role: 'user', content: resolvedInput }],
output: resolvedOutput,
answer: resolvedOutput,
messages,
input: inputMessages,
question: resolvedInput,
criteria: '',
expected_output: [],
reference_answer: '',

input_files: [],
trace: null,
trace,
token_usage: null,
cost_usd: null,
duration_ms: null,
Expand Down
Loading
Loading