-
Notifications
You must be signed in to change notification settings - Fork 0
132 lines (112 loc) · 3.96 KB
/
eval.yml
File metadata and controls
132 lines (112 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Reusable workflow: builds the CLI, runs the agent evaluation suite, compares
# the results against a baseline artifact on pull requests, enforces the
# quality gates, uploads the results, and posts/updates a summary PR comment.
name: Agent Evaluation

on:
  workflow_call:
    secrets:
      # Optional so callers without the key can still invoke the workflow.
      ANTHROPIC_API_KEY:
        required: false

jobs:
  evaluate:
    runs-on: ubuntu-latest
    # Declaring any permission sets every unlisted scope to `none`, so each
    # scope a step relies on has to be listed here.
    permissions:
      # Needed by the baseline download step to list workflow runs and fetch
      # their artifacts; without it the step fails and is silently skipped
      # over because of `continue-on-error`.
      actions: read
      contents: read
      pull-requests: write
      issues: write
    steps:
      - uses: actions/checkout@v4

      - name: Setup pnpm
        uses: pnpm/action-setup@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'
          cache: 'pnpm'

      - name: Install dependencies
        run: pnpm install --frozen-lockfile

      - name: Build
        run: pnpm build

      # In a called workflow the `github` context is that of the caller, so
      # `github.event_name` reflects the event that triggered the caller.
      - name: Download baseline results
        if: github.event_name == 'pull_request'
        # v6+ no longer downloads artifacts produced by fork runs by default
        # (artifact-poisoning hardening) and runs on a supported Node runtime.
        uses: dawidd6/action-download-artifact@v6
        with:
          workflow: ci.yml
          branch: ${{ github.base_ref }}
          name: eval-results
          path: baseline/
        # A missing baseline (e.g. first run on a branch) must not fail the job.
        continue-on-error: true

      - name: Run evaluation suite
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          mkdir -p results
          node packages/cli/dist/cli.js eval \
            trajectories/examples/*.jsonl \
            --config trajectories/examples/config.yaml \
            --output results/

      # Only compare when the exact baseline file the command reads exists;
      # hashFiles() returns an empty string when the pattern matches no file.
      - name: Run regression gates
        if: github.event_name == 'pull_request' && hashFiles('baseline/results.json') != ''
        run: |
          node packages/cli/dist/cli.js compare \
            baseline/results.json \
            results/results.json \
            --format markdown \
            --output results/comparison.md

      - name: Check gates
        run: |
          node packages/cli/dist/cli.js gate \
            results/results.json \
            --preset standard \
            --exit-code

      # Runs even after a failed gate so the results are always available.
      - name: Upload evaluation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: results/
          retention-days: 30

      # Runs even after a failed gate so the PR always gets a summary.
      - name: Comment on PR
        if: github.event_name == 'pull_request' && always()
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');

            // The heading doubles as the marker used below to locate a
            // comment left by a previous run.
            let comment = '## Agent Evaluation Results\n\n';
            try {
              const results = JSON.parse(fs.readFileSync('results/results.json', 'utf8'));
              comment += `**Overall Score:** ${(results.overallMetrics.overallScore * 100).toFixed(1)}%\n`;
              comment += `**Pass Rate:** ${results.summary.passRate.toFixed(1)}%\n`;
              comment += `**Trajectories:** ${results.summary.totalTrajectories}\n\n`;
              if (fs.existsSync('results/comparison.md')) {
                comment += '### Comparison with Baseline\n\n';
                comment += fs.readFileSync('results/comparison.md', 'utf8');
              }
            } catch (error) {
              // Best effort: still post a comment, but surface the cause in
              // the job log instead of swallowing it.
              core.warning(`Could not read evaluation results: ${error.message}`);
              comment += '⚠️ Evaluation results could not be parsed.\n';
            }
            comment += '\n---\n*Generated by agent-eval-harness*';

            // Walk every page of comments; a single listComments call only
            // returns the first page, which would miss the earlier bot
            // comment on busy PRs and create duplicates.
            const comments = await github.paginate(github.rest.issues.listComments, {
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              per_page: 100,
            });
            // `user` and `body` may be null, hence the optional chaining.
            const botComment = comments.find(existing =>
              existing.user?.type === 'Bot' &&
              existing.body?.includes('Agent Evaluation Results')
            );

            if (botComment) {
              await github.rest.issues.updateComment({
                comment_id: botComment.id,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: comment
              });
            } else {
              await github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: comment
              });
            }