Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,21 @@ The experiment config will specify:
- Evaluations (evals) that are used to grade the output of the model
- Treatments that specify the different conditions you would like to test (for
example, testing with an MCP server versus without)

Evals can also be defined inline in an experiment. Inline eval paths resolve
from the directory where the CLI is run, and each inline eval directory must
contain the same `eval.config.ts` and `eval.test.ts` files as repository evals:

```ts
export const experiment: ExperimentConfig = {
name: 'Local project experiment',
description: 'Run an eval from the current project',
models: ['gpt-5.5'],
evals: [
{
path: './evals/local-button-eval',
},
],
treatments: [],
}
```
15 changes: 12 additions & 3 deletions packages/agent-eval/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {get as getEval} from '@primer/agent-evals'
import {list, find} from '@primer/agent-experiments'
import {ControlTreatment, type ExperimentConfig, type Model} from '@primer/agent-experiment'
import type {Treatment, TreatmentResult} from './treatment'
import {resolveExperimentEval} from './eval'
import {run} from './run'

const COPILOT_GITHUB_TOKEN = process.env.COPILOT_GITHUB_TOKEN
Expand Down Expand Up @@ -371,20 +372,28 @@ const results: Array<TreatmentResult> = []
for (const config of experimentConfigs) {
console.log('Running experiment:', config.name)

const evals = await Promise.all(
config.evals.map(evalConfig => {
return resolveExperimentEval(evalConfig, {
builtInEvalResolver: getEval,
})
}),
)

const treatments: Array<Treatment> = config.models.flatMap(model => {
return config.evals.flatMap(evalId => {
return evals.flatMap(evalConfig => {
return [
{
config: ControlTreatment,
eval: getEval(evalId),
eval: evalConfig,
experiment: config,
id: randomUUID(),
model,
},
...config.treatments.map(treatment => {
return {
config: treatment,
eval: getEval(evalId),
eval: evalConfig,
experiment: config,
id: randomUUID(),
model,
Expand Down
4 changes: 1 addition & 3 deletions packages/agent-eval/src/config.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
type EvalConfig = {
prompt: string
}
import type {EvalConfig} from '@primer/agent-experiment'

function defineConfig(config: EvalConfig) {
return config
Expand Down
123 changes: 123 additions & 0 deletions packages/agent-eval/src/eval.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import fs from 'node:fs/promises'
import os from 'node:os'
import path from 'node:path'
import {afterEach, describe, expect, test} from 'vitest'
import type {ExperimentEvalConfig} from '@primer/agent-experiment'
import {resolveExperimentEval, type ResolvedEval} from './eval'

const temporaryDirectories: Array<string> = []

/**
 * Creates a unique `agent-eval-*` directory under the OS temp directory and
 * records it in `temporaryDirectories` before returning its path.
 */
async function createTemporaryDirectory() {
  const prefix = path.join(os.tmpdir(), 'agent-eval-')
  const created = await fs.mkdtemp(prefix)
  temporaryDirectories.push(created)
  return created
}

// Remove every directory recorded in `temporaryDirectories` after each test.
// `splice(0)` empties the shared list and hands back its previous contents.
afterEach(async () => {
  const pendingRemoval = temporaryDirectories.splice(0)
  const removals = pendingRemoval.map(directory => fs.rm(directory, {recursive: true, force: true}))
  await Promise.all(removals)
})

describe(resolveExperimentEval, () => {
  // Resolver for inline-eval tests: any call to it fails the test.
  const rejectBuiltInLookup = (): never => {
    throw new Error('Unexpected built-in eval lookup')
  }

  // Creates `<cwd>/<segments...>` and writes the given files into it, in order.
  async function createEvalDirectory(
    cwd: string,
    segments: ReadonlyArray<string>,
    files: ReadonlyArray<readonly [filename: string, contents: string]>,
  ): Promise<string> {
    const directory = path.join(cwd, ...segments)
    await fs.mkdir(directory, {recursive: true})
    for (const [filename, contents] of files) {
      await fs.writeFile(path.join(directory, filename), contents)
    }
    return directory
  }

  test('resolves built-in evals through the provided resolver', async () => {
    const evalId = '001-agent-uses-button-from-primer'
    const builtinEval: ResolvedEval = {
      id: evalId,
      directory: '/path/to/eval',
      config: {prompt: 'Use a Primer button'},
      testPath: '/path/to/eval/eval.test.ts',
    }

    const resolved = resolveExperimentEval(evalId as ExperimentEvalConfig, {
      builtInEvalResolver(id) {
        expect(id).toBe(evalId)
        return builtinEval
      },
    })

    // `toBe`: the resolver's object is returned as-is, not copied.
    await expect(resolved).resolves.toBe(builtinEval)
  })

  test('resolves inline eval directories relative to the provided cwd', async () => {
    const cwd = await createTemporaryDirectory()
    // The `.mjs` config is a decoy: only `eval.config.ts` should be loaded.
    const directory = await createEvalDirectory(
      cwd,
      ['evals', 'local-eval'],
      [
        ['eval.config.mjs', `export default {prompt: 'Use ignored config'}`],
        ['eval.config.ts', `export default {prompt: 'Update the local project'}`],
        ['eval.test.ts', ''],
      ],
    )

    const resolved = resolveExperimentEval(
      {name: 'local-eval', path: 'evals/local-eval'},
      {builtInEvalResolver: rejectBuiltInLookup, cwd},
    )

    await expect(resolved).resolves.toEqual({
      id: 'local-eval',
      directory,
      config: {
        prompt: 'Update the local project',
      },
      testPath: path.join(directory, 'eval.test.ts'),
    })
  })

  test('defaults inline eval names to the directory name', async () => {
    const cwd = await createTemporaryDirectory()
    const directory = await createEvalDirectory(
      cwd,
      ['evals', 'local-button-eval'],
      [
        ['eval.config.ts', `export default {prompt: 'Update the local project'}`],
        ['eval.test.ts', ''],
      ],
    )

    const resolved = resolveExperimentEval(
      {path: './evals/local-button-eval'},
      {builtInEvalResolver: rejectBuiltInLookup, cwd},
    )

    await expect(resolved).resolves.toMatchObject({
      id: 'local-button-eval',
      directory,
    })
  })

  test('requires inline evals to use the default eval file structure', async () => {
    const cwd = await createTemporaryDirectory()
    // The test file has a non-default name, so `eval.test.ts` is missing.
    const directory = await createEvalDirectory(
      cwd,
      ['fixtures', 'local-eval'],
      [
        ['eval.config.ts', `export default {prompt: 'Use default config'}`],
        ['custom.test.ts', ''],
      ],
    )

    const resolved = resolveExperimentEval(
      {name: 'local-eval', path: 'fixtures/local-eval'},
      {builtInEvalResolver: rejectBuiltInLookup, cwd},
    )

    const expectedTestPath = path.join(directory, 'eval.test.ts')
    await expect(resolved).rejects.toThrow(`Eval "local-eval" test file was not found: ${expectedTestPath}`)
  })
})
76 changes: 76 additions & 0 deletions packages/agent-eval/src/eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import fs from 'node:fs/promises'
import path from 'node:path'
import {pathToFileURL} from 'node:url'
import type {EvalConfig, ExperimentEvalConfig} from '@primer/agent-experiment'

/**
 * The string members of `ExperimentEvalConfig`. `resolveExperimentEval` passes
 * values of this type to `builtInEvalResolver`.
 */
type BuiltInEvalId = Extract<ExperimentEvalConfig, string>

/**
 * An eval resolved by `resolveExperimentEval`.
 *
 * The field notes below describe inline evals. For built-in evals the value
 * comes straight from `builtInEvalResolver`, so its contents are whatever that
 * resolver supplies.
 */
type ResolvedEval = {
// Inline evals: the configured `name`, or the basename of `directory` when no name is given.
readonly id: string
// Inline evals: the eval path resolved against the working directory.
readonly directory: string
// Inline evals: the validated default export of `eval.config.ts` in `directory`.
readonly config: EvalConfig
// Inline evals: resolved path of `eval.test.ts` in `directory`.
readonly testPath: string
}

/** Options accepted by `resolveExperimentEval`. */
type ResolveEvalOptions = {
// Called when the eval entry is a string id; its return value is used as the result.
builtInEvalResolver: (id: BuiltInEvalId) => ResolvedEval
// Base directory for inline eval paths. Falls back to `process.cwd()` when omitted.
cwd?: string
}

/**
 * Type guard for `EvalConfig`: accepts any non-null object that has a `prompt`
 * property whose value is a string.
 */
function isEvalConfig(value: unknown): value is EvalConfig {
  if (typeof value !== 'object' || value === null) {
    return false
  }
  if (!('prompt' in value)) {
    return false
  }
  const {prompt} = value as Record<string, unknown>
  return typeof prompt === 'string'
}
Comment on lines +19 to +26

/**
 * Rejects unless `directory` can be stat'ed and is a directory.
 *
 * Any failure from `fs.stat` is treated the same as "not a directory".
 *
 * @param directory - Path to check.
 * @param name - Eval name, used only in the error message.
 * @throws Error when the path cannot be stat'ed or is not a directory.
 */
async function assertDirectory(directory: string, name: string) {
  let found = false
  try {
    const entry = await fs.stat(directory)
    found = entry.isDirectory()
  } catch {
    found = false
  }

  if (found) {
    return
  }
  throw new Error(`Eval "${name}" directory was not found: ${directory}`)
}

/**
 * Rejects unless `filepath` can be stat'ed and is a regular file.
 *
 * Any failure from `fs.stat` is treated the same as "not a file".
 *
 * @param filepath - Path to check.
 * @param name - Eval name, used only in the error message.
 * @throws Error when the path cannot be stat'ed or is not a file.
 */
async function assertFile(filepath: string, name: string) {
  let found = false
  try {
    const entry = await fs.stat(filepath)
    found = entry.isFile()
  } catch {
    found = false
  }

  if (found) {
    return
  }
  throw new Error(`Eval "${name}" test file was not found: ${filepath}`)
}

/**
 * Dynamically imports an eval config module and validates its default export.
 *
 * @param configPath - Filesystem path of the config module (the caller passes
 *   an absolute path).
 * @param name - Eval name, used only in the error message.
 * @returns The module's default export once it passes `isEvalConfig`.
 * @throws Error when the default export is missing or fails `isEvalConfig`.
 *   Failures from the import itself (for example a missing file) propagate
 *   unchanged.
 */
async function loadEvalConfig(configPath: string, name: string): Promise<EvalConfig> {
  // `import()` takes a module specifier, not a filesystem path. A Windows
  // drive-letter path such as `C:\evals\eval.config.ts` is rejected by Node's
  // ESM loader, so convert the path to a `file://` URL first.
  const specifier = pathToFileURL(configPath).href
  const configModule = (await import(specifier)) as {default?: unknown}
  if (!isEvalConfig(configModule.default)) {
    throw new Error(`Eval "${name}" config must export a default config with a prompt`)
  }
  return configModule.default
}

/**
 * Resolves one experiment eval entry to a `ResolvedEval`.
 *
 * A string entry is handed to `options.builtInEvalResolver`. An object entry is
 * an inline eval: its `path` is resolved against `options.cwd` (or
 * `process.cwd()`), and that directory must exist and contain `eval.config.ts`
 * and `eval.test.ts`.
 *
 * @param evalConfig - Built-in eval id or inline eval definition.
 * @param options - Built-in resolver and optional base directory.
 * @returns The resolved eval; inline evals use `name`, or the directory's
 *   basename when `name` is absent, as their id.
 * @throws Error when the inline eval directory or test file is missing, or when
 *   the config's default export is not a valid eval config.
 */
async function resolveExperimentEval(
  evalConfig: ExperimentEvalConfig,
  options: ResolveEvalOptions,
): Promise<ResolvedEval> {
  if (typeof evalConfig === 'string') {
    return options.builtInEvalResolver(evalConfig)
  }

  const baseDirectory = options.cwd ?? process.cwd()
  const evalDirectory = path.resolve(baseDirectory, evalConfig.path)
  const evalName = evalConfig.name ?? path.basename(evalDirectory)
  const configPath = path.resolve(evalDirectory, 'eval.config.ts')
  const testPath = path.resolve(evalDirectory, 'eval.test.ts')

  // Check order is fixed: directory, then config, then test file.
  await assertDirectory(evalDirectory, evalName)
  const config = await loadEvalConfig(configPath, evalName)
  await assertFile(testPath, evalName)

  return {id: evalName, directory: evalDirectory, config, testPath}
}

export {resolveExperimentEval}
export type {ResolvedEval}
4 changes: 2 additions & 2 deletions packages/agent-eval/src/treatment.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import type {Eval} from '@primer/agent-evals'
import type {ExperimentConfig, Model, TreatmentConfig} from '@primer/agent-experiment'
import type {ResolvedEval} from './eval'

type Treatment = {
config: TreatmentConfig
eval: Eval
eval: ResolvedEval
experiment: ExperimentConfig
id: string
model: Model
Expand Down
4 changes: 2 additions & 2 deletions packages/evals/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import {data} from './generated/evals.ts'
import type {EvalId, Eval} from './generated/evals.ts'
import {data} from './generated/evals'
import type {EvalId, Eval} from './generated/evals'

function list(): ReadonlyArray<Eval> {
return data
Expand Down
15 changes: 13 additions & 2 deletions packages/experiment/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,22 @@ import type {EvalId} from '@primer/agent-evals'
import type {Sandbox} from '@primer/agent-sandbox'
import type {Model} from './model'

/**
 * Shape of an eval's config. An inline eval's `eval.config.ts` must
 * default-export a value of this shape.
 */
type EvalConfig = {
// NOTE(review): presumably the instruction given to the model under evaluation; confirm.
prompt: string
}

/** An eval defined inline in an experiment by pointing at a directory. */
type InlineEvalConfig = {
// Optional eval name; the resolver falls back to the directory's basename when omitted.
name?: string
// Eval directory; the resolver resolves it against the working directory the CLI is run from.
path: string
}

/** One entry of an experiment's `evals` list: an `EvalId` or an inline eval definition. */
type ExperimentEvalConfig = EvalId | InlineEvalConfig

type ExperimentConfig = {
name: string
description: string
models: Array<Model>
evals: Array<EvalId>
evals: Array<ExperimentEvalConfig>
setup?: Setup
treatments: Array<TreatmentConfig>
}
Expand All @@ -18,4 +29,4 @@ type TreatmentConfig = {

type Setup = ({sandbox}: {sandbox: Sandbox}) => Promise<void>

export type {ExperimentConfig, TreatmentConfig}
export type {EvalConfig, ExperimentConfig, ExperimentEvalConfig, InlineEvalConfig, TreatmentConfig}
4 changes: 2 additions & 2 deletions packages/experiment/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import type {ExperimentConfig, TreatmentConfig} from './config.ts'
import type {EvalConfig, ExperimentConfig, ExperimentEvalConfig, InlineEvalConfig, TreatmentConfig} from './config'

const ControlTreatment: TreatmentConfig = {
name: 'Control',
}

export {ControlTreatment}
export type {ExperimentConfig, TreatmentConfig}
export type {EvalConfig, ExperimentConfig, ExperimentEvalConfig, InlineEvalConfig, TreatmentConfig}
export type {Model} from './model'
4 changes: 2 additions & 2 deletions packages/sandbox/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ import Docker from 'dockerode'
import tarFs from 'tar-fs'
import type {Headers} from 'tar-fs'
import tarStream from 'tar-stream'
import {McpConfigFileSchema} from './mcp-config.ts'
import type {McpConfigFile, McpServerConfig} from './mcp-config.ts'
import {McpConfigFileSchema} from './mcp-config'
import type {McpConfigFile, McpServerConfig} from './mcp-config'

/**
* Working directory inside the container.
Expand Down