|
11 | 11 |
|
12 | 12 | permissions: |
13 | 13 | contents: read |
| 14 | + checks: write |
14 | 15 |
|
15 | 16 | jobs: |
16 | 17 | test: |
@@ -76,3 +77,111 @@ jobs: |
76 | 77 |
|
77 | 78 | - name: Validate Python playground examples |
78 | 79 | run: python scripts/validate-python-examples.py playground/ |
| 80 | + |
| 81 | + benchmark: |
| 82 | + # Run the OpenEvolve benchmark for autoloop *-evolve PRs and branch pushes so the autoloop |
| 83 | + # agent can read a real fitness number from CI (see .autoloop/strategies/ |
| 84 | + # openevolve/strategy.md, Step 6.5). The sandbox the agent runs in cannot |
| 85 | + # install bun reliably and so cannot measure fitness itself. |
| 86 | + name: OpenEvolve benchmark # NOTE(review): the check-run created by the last step uses this same name |
| 87 | + if: | |
| 88 | + (github.event_name == 'pull_request' && startsWith(github.head_ref, 'autoloop/') && contains(github.head_ref, '-evolve')) |
| 89 | + || (github.event_name == 'push' && startsWith(github.ref_name, 'autoloop/') && contains(github.ref_name, '-evolve')) |
| 90 | + runs-on: ubuntu-latest |
| 91 | + permissions: |
| 92 | + contents: read |
| 93 | + checks: write # needed by the final step, which calls github.rest.checks.create |
| 94 | + steps: |
| 95 | + - uses: actions/checkout@v4 |
| 96 | + |
| 97 | + - name: Setup Bun |
| 98 | + uses: oven-sh/setup-bun@v2 |
| 99 | + with: |
| 100 | + bun-version: latest # NOTE(review): floating version — fitness numbers may shift between runs; consider pinning |
| 101 | + |
| 102 | + - name: Install dependencies |
| 103 | + run: bun install |
| 104 | + |
| 105 | + - name: Setup Python |
| 106 | + uses: actions/setup-python@v5 |
| 107 | + with: |
| 108 | + python-version: "3.12" |
| 109 | + |
| 110 | + - name: Install Python dependencies |
| 111 | + run: pip install pandas numpy # NOTE(review): unpinned packages — consider pinning for reproducible benchmark results |
| 112 | + |
| 113 | + - name: Resolve program directory |
| 114 | + id: program |
| 115 | + run: | |
| 116 | + # Resolve the program directory from the branch name: |
| 117 | + # autoloop/<program-name> → .autoloop/programs/<program-name>/ |
| 118 | + BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" # PR head branch when set, otherwise the pushed ref name |
| 119 | + PROGRAM="${BRANCH#autoloop/}" |
| 120 | + PROGRAM_DIR=".autoloop/programs/${PROGRAM}" |
| 121 | + echo "program=${PROGRAM}" >> "$GITHUB_OUTPUT" |
| 122 | + echo "program_dir=${PROGRAM_DIR}" >> "$GITHUB_OUTPUT" |
| 123 | + if [ -f "${PROGRAM_DIR}/evaluate.sh" ]; then # -f, not -x: the script is run via `bash <file>`, so no executable bit is required |
| 124 | + echo "has_evaluator=true" >> "$GITHUB_OUTPUT" |
| 125 | + else |
| 126 | + echo "No evaluate.sh for program '${PROGRAM}' — skipping benchmark." >&2 |
| 127 | + echo "has_evaluator=false" >> "$GITHUB_OUTPUT" |
| 128 | + fi |
| 129 | +
|
| 130 | + - name: Run OpenEvolve benchmark |
| 131 | + id: bench |
| 132 | + if: steps.program.outputs.has_evaluator == 'true' |
| 133 | + env: { PROGRAM_DIR: "${{ steps.program.outputs.program_dir }}" } # branch-derived value: pass via env, never splice into the script text |
| 134 | + run: | |
| 135 | + # evaluate.sh is contracted to always exit 0 and encode failures in |
| 136 | + # the JSON, but we tolerate non-zero exits and empty/malformed output anyway |
| 137 | + # and fall back to a null fitness so the check-run still gets created. |
| 138 | + set +e |
| 139 | + bash "${PROGRAM_DIR}/evaluate.sh" >/tmp/bench-result.json 2>/tmp/bench-stderr |
| 140 | + rc=$? |
| 141 | + set -e |
| 142 | + if ! jq -e -s 'length == 1 and (.[0] | type == "object")' /tmp/bench-result.json >/dev/null 2>&1; then |
| 143 | + echo "{\"fitness\": null, \"rejected_reason\": \"evaluator produced no valid JSON object (exit ${rc})\"}" \ |
| 144 | + > /tmp/bench-result.json |
| 145 | + fi |
| 146 | + cat /tmp/bench-result.json |
| 147 | + fitness=$(jq -r '.fitness // "null"' /tmp/bench-result.json) |
| 148 | + echo "fitness=${fitness}" >> "$GITHUB_OUTPUT" |
| 149 | + # Compact (single-line) JSON for the check-run output below. |
| 150 | + echo "result_json=$(jq -c . /tmp/bench-result.json)" >> "$GITHUB_OUTPUT" |
| 151 | +
|
| 152 | + - name: Upload benchmark result |
| 153 | + if: steps.program.outputs.has_evaluator == 'true' |
| 154 | + uses: actions/upload-artifact@v4 |
| 155 | + with: |
| 156 | + name: benchmark-result |
| 157 | + path: /tmp/bench-result.json # written by the "Run OpenEvolve benchmark" step (evaluator stdout or the null-fitness fallback) |
| 158 | + |
| 159 | + - name: Attach fitness as check-run |
| 160 | + if: steps.program.outputs.has_evaluator == 'true' |
| 161 | + uses: actions/github-script@v7 |
| 162 | + env: |
| 163 | + FITNESS: ${{ steps.bench.outputs.fitness }} |
| 164 | + RESULT_JSON: ${{ steps.bench.outputs.result_json }} |
| 165 | + with: |
| 166 | + script: | |
| 167 | + const fitness = (process.env.FITNESS ?? "").trim() || "null"; // a missing or empty value is reported the same as null |
| 168 | + let result; |
| 169 | + try { |
| 170 | + result = JSON.parse(process.env.RESULT_JSON); |
| 171 | + } catch { // not valid JSON: surface the raw text in the summary instead of failing |
| 172 | + result = { raw: process.env.RESULT_JSON }; |
| 173 | + } |
| 174 | + const sha = context.payload.pull_request // attach to the PR head commit when the event carries a pull_request payload |
| 175 | + ? context.payload.pull_request.head.sha |
| 176 | + : context.sha; |
| 177 | + await github.rest.checks.create({ |
| 178 | + ...context.repo, |
| 179 | + name: "OpenEvolve benchmark", |
| 180 | + head_sha: sha, |
| 181 | + status: "completed", |
| 182 | + conclusion: Number.isFinite(Number(fitness)) ? "success" : "neutral", // only a real number counts as a measured fitness |
| 183 | + output: { |
| 184 | + title: `fitness=${fitness}`, |
| 185 | + summary: "```json\n" + JSON.stringify(result, null, 2) + "\n```", |
| 186 | + }, |
| 187 | + }); |
0 commit comments