-
-
Notifications
You must be signed in to change notification settings - Fork 0
116 lines (109 loc) · 3.94 KB
/
Copy pathagent-eval-external.yml
File metadata and controls
116 lines (109 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Optional, manually triggered agent-eval run against an indexed fixture inside the checkout (default: fixtures/minimal).
# To evaluate an external tree, clone it into the checkout first, then pass its repo-relative fixture_root together with matching scenarios/probes files.
name: Agent eval (external)

# Manual trigger only. Every input is optional and carries a default, so the
# workflow can be dispatched without filling anything in.
on:
  workflow_dispatch:
    inputs:
      # Directory handed to the harness as the indexed project root.
      fixture_root:
        description: "Indexed project root — repo-relative path under the checkout (default fixtures/minimal)"
        required: false
        default: fixtures/minimal
        type: string
      # Selects which harness arm runs; restricted to the two listed options.
      mode:
        description: "Harness mode — probe (queryRows) or live (MCP handlers)"
        required: false
        default: probe
        type: choice
        options:
          - probe
          - live
      # Kept as a quoted string; the job checks that it is a positive integer
      # before the harness runs.
      runs:
        description: "Repeat count per probe"
        required: false
        default: "1"
        type: string
      # Empty string means "do not override"; the description names the file
      # that is used in that case.
      scenarios:
        description: "Golden scenarios JSON — repo-relative; empty = fixtures/golden/scenarios.json"
        required: false
        default: ""
        type: string
      # Empty string means "do not override"; the description names the file
      # that is used in that case.
      probes:
        description: "Probe definitions JSON — repo-relative; empty = scripts/agent-eval/scenarios.json"
        required: false
        default: ""
        type: string
jobs:
  # Single job: validate the dispatch inputs, run the agent-eval harness, then
  # publish the comparison file it leaves behind.
  agent-eval-external:
    name: Agent eval (${{ inputs.mode }})
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Local composite action from this repository. Its contents are not
      # visible here; presumably it installs the toolchain (bun) used by the
      # later steps — confirm against .github/actions/setup.
      - name: Setup
        uses: ./.github/actions/setup

      # Validates the dispatch inputs and converts the repo-relative paths into
      # absolute ones.
      # Outputs: `fixture` (always), `scenarios` and `probes` (only when the
      # corresponding input is non-empty).
      - name: Resolve paths
        id: paths
        env:
          # Inputs reach the script through environment variables rather than
          # inline expression interpolation, so their text is never parsed as
          # shell code.
          INPUT_RUNS: ${{ inputs.runs }}
          INPUT_FIXTURE_ROOT: ${{ inputs.fixture_root }}
          INPUT_SCENARIOS: ${{ inputs.scenarios }}
          INPUT_PROBES: ${{ inputs.probes }}
        run: |
          set -euo pipefail

          # resolve_optional_file NAME REL_PATH
          # No-op when REL_PATH is empty. Otherwise rejects paths containing
          # "..", requires the file to exist under the workspace, and writes
          # NAME=<absolute path> to the step outputs.
          resolve_optional_file() {
            local name="$1" rel="$2" abs
            [[ -n "$rel" ]] || return 0
            if [[ "$rel" == *".."* ]]; then
              echo "$name must not contain .." >&2
              exit 1
            fi
            abs="$GITHUB_WORKSPACE/$rel"
            if [[ ! -f "$abs" ]]; then
              echo "$name file not found: $abs" >&2
              exit 1
            fi
            echo "$name=$abs" >> "$GITHUB_OUTPUT"
          }

          # The repeat count must be a positive integer without leading zeros.
          RUNS="$INPUT_RUNS"
          if ! [[ "$RUNS" =~ ^[1-9][0-9]*$ ]]; then
            echo "runs must be a positive integer (got: $RUNS)" >&2
            exit 1
          fi

          # An empty fixture_root falls back to the documented default instead
          # of silently resolving to the checkout root.
          FIXTURE="${INPUT_FIXTURE_ROOT:-fixtures/minimal}"
          if [[ "$FIXTURE" == *".."* ]]; then
            echo "fixture_root must not contain .." >&2
            exit 1
          fi
          FIXTURE_ABS="$GITHUB_WORKSPACE/$FIXTURE"
          if [[ ! -d "$FIXTURE_ABS" ]]; then
            echo "fixture_root not found: $FIXTURE_ABS" >&2
            exit 1
          fi
          echo "fixture=$FIXTURE_ABS" >> "$GITHUB_OUTPUT"

          resolve_optional_file scenarios "$INPUT_SCENARIOS"
          resolve_optional_file probes "$INPUT_PROBES"

      # Runs only for the default fixture. An empty fixture_root is resolved to
      # that default by the previous step, so it is accepted here as well.
      - name: Golden index (fixtures/minimal only)
        if: inputs.fixture_root == 'fixtures/minimal' || inputs.fixture_root == ''
        run: bun run test:golden

      # Hands the validated settings to the harness script via environment
      # variables.
      - name: Run agent-eval harness
        env:
          AGENT_EVAL_MODE: ${{ inputs.mode }}
          AGENT_EVAL_FIXTURE_ROOT: ${{ steps.paths.outputs.fixture }}
          AGENT_EVAL_RUNS: ${{ inputs.runs }}
          AGENT_EVAL_PRINT_SUMMARY: "1"
          # Empty when the matching input was left blank (the output is not set).
          AGENT_EVAL_SCENARIOS: ${{ steps.paths.outputs.scenarios }}
          AGENT_EVAL_PROBES: ${{ steps.paths.outputs.probes }}
          # Set to the two tool names in live mode, empty string otherwise.
          CODEMAP_MCP_TOOLS: ${{ inputs.mode == 'live' && 'query,query_recipe' || '' }}
        run: bash scripts/agent-eval/run-arms.sh

      # The step fails if the comparison file is missing.
      - name: Upload comparison artifact
        uses: actions/upload-artifact@v4
        with:
          name: agent-eval-comparison
          path: .agent-eval/comparison.json
          if-no-files-found: error