bench-sweep #55
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: bench-sweep

# Parallel benchmark sweep over method × budget × test_set (plus EGO graph
# depth in full mode), fanned out across several runners.
#
# Modes, chosen when the workflow is dispatched:
#   smoke — GitHub-hosted ubuntu-latest, one cell per method
#           ({ppr, ego, bm25, aider} × 64000 × swebench_verified),
#           5 instances each, no Hetzner provisioning. Quick sanity check.
#   full  — Hetzner CCX63 (4 × self-hosted runners), the complete matrix
#           declared by the sweep-full job, baked repo cache.
#           Production sweep.
#
# Trigger via workflow_dispatch. Results are committed to bench-results/sweep.
'on':
  workflow_dispatch:
    inputs:
      mode:
        type: choice
        description: 'smoke = GH Actions, 5 instances | full = Hetzner CCX63, complete sweep'
        options:
          - smoke
          - full
        default: smoke
        required: true

# A single concurrency group with cancel-in-progress disabled: a newly
# dispatched sweep queues behind the running one instead of cancelling it.
concurrency:
  cancel-in-progress: false
  group: bench-sweep

# Workflow-wide defaults. Numeric-looking values are quoted so they stay
# strings.
env:
  REGISTRY: ghcr.io
  IMAGE_REF: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
  MANIFESTS_SUBDIR: v1
  TIMEOUT_PER_INSTANCE: "600"
  TAU: "0.12"
  CORE_BUDGET_FRACTION: "0.5"
  RESULTS_BRANCH: bench-results/sweep
  SERVER_TAG: bench-${{ github.run_id }}-${{ github.run_attempt }}
| jobs: | |
| # ============================================================================ | |
| # 1. Compute a stable sweep_id used as the directory name for raw artifacts | |
| # ============================================================================ | |
prep:
  # Exposes `sweep_id` and `ts` as job outputs for the jobs that need prep.
  runs-on: ubuntu-latest
  permissions:
    contents: read
  outputs:
    sweep_id: ${{ steps.id.outputs.sweep_id }}
    ts: ${{ steps.id.outputs.ts }}
  steps:
    - uses: actions/checkout@v6
    - id: id
      run: |
        # sweep_id = "sweep-<UTC timestamp>-<short commit hash of HEAD>".
        stamp=$(date -u +%Y%m%dT%H%M%SZ)
        short_sha=$(git rev-parse --short HEAD)
        {
          echo "sweep_id=sweep-${stamp}-${short_sha}"
          echo "ts=${stamp}"
        } >> "$GITHUB_OUTPUT"
| # ============================================================================ | |
| # 2. Provision Hetzner CCX63 — full mode only | |
| # ============================================================================ | |
provision:
  # Full mode only. Creates one Hetzner CCX63 whose cloud-init script installs
  # Docker, starts the repo-cache bake in the background and registers four
  # self-hosted runners; then blocks until all four report "online".
  needs: prep
  if: inputs.mode == 'full'
  runs-on: ubuntu-latest
  permissions:
    actions: write
  outputs:
    server_id: ${{ steps.create.outputs.server_id }}
    server_ip: ${{ steps.create.outputs.server_ip }}
  steps:
    - name: Create Hetzner CCX63 with 4 self-hosted runners
      id: create
      env:
        HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
        GH_TOKEN: ${{ secrets.GH_PAT }}
        REPO: ${{ github.repository }}
        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
        BENCH_SSH_PUBLIC_KEY: ${{ secrets.BENCH_SSH_PUBLIC_KEY }}
      run: |
        set -euo pipefail

        # `jq -r` prints the literal string "null" when a field is absent, so
        # an API response without the expected field must be rejected here
        # rather than being baked into the cloud-init script.
        require_value() {
          if [[ -z "$2" || "$2" == "null" ]]; then
            echo "::error::Could not obtain $1." >&2
            exit 1
          fi
        }

        REG_TOKEN=$(curl -sf -X POST \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "https://api.github.com/repos/${REPO}/actions/runners/registration-token" \
          | jq -r '.token')
        require_value "runner registration token" "${REG_TOKEN}"
        echo "::add-mask::${REG_TOKEN}"

        # Latest runner release tag with its leading "v" stripped.
        RUNNER_VER=$(curl -sf \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          "https://api.github.com/repos/actions/runner/releases/latest" \
          | jq -r '.tag_name[1:]')
        require_value "latest actions/runner version" "${RUNNER_VER}"

        REPO_URL="https://github.com/${REPO}"

        # The heredoc delimiter is unquoted on purpose: unescaped variables
        # (tokens, image, runner version, server name) are expanded HERE, on
        # the GitHub-hosted runner, while backslash-escaped ones are left for
        # the server to expand at boot. The image comes from IMAGE_REF so the
        # bake uses the same image as the sweep containers.
        # NOTE(review): GH_PAT ends up in plain text inside the server's
        # user_data; confirm that is acceptable or switch to a short-lived
        # credential.
        cat > /tmp/cloud-init.sh << ENDINIT
        #!/bin/bash
        set -euo pipefail
        export DEBIAN_FRONTEND=noninteractive
        apt-get update -y
        apt-get install -y docker.io curl jq
        systemctl enable docker && systemctl start docker
        mkdir -p /root/.ssh && chmod 700 /root/.ssh
        echo "${BENCH_SSH_PUBLIC_KEY}" >> /root/.ssh/authorized_keys
        chmod 600 /root/.ssh/authorized_keys
        chage -M -1 root
        useradd -m -s /bin/bash github
        usermod -aG docker github
        echo "${GH_TOKEN}" | docker login ghcr.io -u nikolay-e --password-stdin
        docker pull ${IMAGE_REF}
        nohup bash -c '
          mkdir -p /data/bench_repos
          BAKE_LOG=/data/bench_repos/bake.log
          if docker run --rm \
            --entrypoint python3 \
            -v /data/bench_repos:/cache/contextbench_repos \
            -e BAKE_PARALLELISM=16 \
            ${IMAGE_REF} \
            /app/scripts/bake_bench_cache.py /cache/contextbench_repos \
            >> "\${BAKE_LOG}" 2>&1; then
            touch /data/bench_repos/.ready
            echo "[bake] OK at \$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "\${BAKE_LOG}"
          else
            rc=\$?
            touch /data/bench_repos/.bake_failed
            echo "[bake] FAILED rc=\${rc} at \$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "\${BAKE_LOG}"
          fi
        ' &
        curl -sL -o /tmp/runner.tar.gz \
          "https://github.com/actions/runner/releases/download/v${RUNNER_VER}/actions-runner-linux-x64-${RUNNER_VER}.tar.gz"
        for N in 1 2 3 4; do
          mkdir -p "/home/github/runner-\${N}"
          tar xzf /tmp/runner.tar.gz -C "/home/github/runner-\${N}"
          chown -R github:github "/home/github/runner-\${N}"
          sudo -u github bash -c "
            cd /home/github/runner-\${N} &&
            ./config.sh \
              --url '${REPO_URL}' \
              --token '${REG_TOKEN}' \
              --name '${SERVER_NAME}-\${N}' \
              --labels 'self-hosted,large' \
              --unattended \
              --replace"
          nohup sudo -u github bash -c \
            "cd /home/github/runner-\${N} && ./run.sh" \
            >> "/home/github/runner-\${N}/runner.log" 2>&1 &
        done
        ENDINIT

        # Build the request body with jq so every value is JSON-escaped.
        jq -n \
          --arg name "${SERVER_NAME}" \
          --rawfile user_data /tmp/cloud-init.sh \
          '{name: $name, server_type: "ccx63", image: "ubuntu-24.04", location: "nbg1", user_data: $user_data}' \
          > /tmp/create-server.json

        # No `-f` here: on an HTTP error we want the response body so the
        # failure reason can be shown instead of a bare non-zero exit.
        HTTP_CODE=$(curl -sS -o /tmp/create-server-response.json -w '%{http_code}' \
          -X POST "https://api.hetzner.cloud/v1/servers" \
          -H "Authorization: Bearer ${HCLOUD_TOKEN}" \
          -H "Content-Type: application/json" \
          --data-binary @/tmp/create-server.json)
        if [[ "${HTTP_CODE}" != 2* ]]; then
          echo "::error::Hetzner server creation failed (HTTP ${HTTP_CODE})." >&2
          jq -r '.error | "\(.code): \(.message)"' /tmp/create-server-response.json >&2 || true
          exit 1
        fi

        SERVER_ID=$(jq -r '.server.id' /tmp/create-server-response.json)
        SERVER_IP=$(jq -r '.server.public_net.ipv4.ip' /tmp/create-server-response.json)
        require_value "Hetzner server id" "${SERVER_ID}"

        echo "server_id=${SERVER_ID}" >> "${GITHUB_OUTPUT}"
        echo "server_ip=${SERVER_IP}" >> "${GITHUB_OUTPUT}"
        echo "${SERVER_ID}" > server_id.txt
        echo "Provisioned Hetzner CCX63 server_id=${SERVER_ID} ip=${SERVER_IP} name=${SERVER_NAME}"
        # Printed as a copy-paste hint only; never executed in this job.
        SSHCMD="ssh root@${SERVER_IP} -i <(security find-generic-password -s treemapper-bench-ssh-key -a \$USER -w)"
        echo "SSH: ${SSHCMD}"
    - name: Upload server_id for cleanup workflow
      uses: actions/upload-artifact@v7
      with:
        name: hetzner-server-id
        path: server_id.txt
        retention-days: 7
    - name: Wait for all 4 runners to come online
      env:
        GH_TOKEN: ${{ secrets.GH_PAT }}
        REPO: ${{ github.repository }}
        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
      run: |
        set -euo pipefail
        echo "Polling for 4 runners prefixed '${SERVER_NAME}'..."
        DEADLINE=$((SECONDS + 900))
        while [[ ${SECONDS} -lt ${DEADLINE} ]]; do
          # A transient API failure counts as "0 online" for this poll rather
          # than aborting the whole wait. The trailing "-" in the prefix stops
          # run attempt 1 from also matching attempt 10, 11, ...
          ONLINE=$(curl -sf \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${REPO}/actions/runners?per_page=100" \
            | jq --arg pfx "${SERVER_NAME}-" \
              '[.runners[] | select(.name | startswith($pfx)) | select(.status == "online")] | length') \
            || ONLINE=0
          ONLINE="${ONLINE:-0}"
          echo " Online runners: ${ONLINE}/4"
          if [[ "${ONLINE}" -ge 4 ]]; then
            echo "All 4 runners ready."
            exit 0
          fi
          sleep 30
        done
        echo "ERROR: runners did not come online within 15 minutes." >&2
        exit 1
| # ============================================================================ | |
| # 3a. Smoke sweep — ubuntu-latest, minimal matrix, 5 instances, no bake cache | |
| # ============================================================================ | |
sweep-smoke:
  # Smoke mode: one cell per method on a GitHub-hosted runner, inside the
  # bench container with a 2-CPU / 6 GB cap.
  if: inputs.mode == 'smoke'
  needs: prep
  runs-on: ubuntu-latest
  timeout-minutes: 60
  permissions:
    contents: read
    packages: read
  strategy:
    fail-fast: false
    matrix:
      method:
        - ppr
        - ego
        - bm25
        - aider
      budget:
        - 64000
      test_set:
        - swebench_verified
  container:
    image: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
    options: --cpus 2 --memory 6g --memory-swap 6g
  env:
    CELL_TAG: ${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    CELL_DIR: results/sweep/${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    MANIFESTS_DIR: benchmarks/manifests/v1
    INPUT_TAU: "0.12"
    INPUT_CBF: "0.5"
    INPUT_TIMEOUT: "120"
    INPUT_MANIFESTS_SUBDIR: v1
  steps:
    - uses: actions/checkout@v6
- name: Cell start metadata
  # Context values reach the script through env vars instead of being
  # spliced into the script text: a ref or actor name containing shell
  # syntax such as $(...) would otherwise be executed by the unquoted
  # heredoc below.
  env:
    SWEEP_ID: ${{ needs.prep.outputs.sweep_id }}
    CELL_METHOD: ${{ matrix.method }}
    CELL_BUDGET: ${{ matrix.budget }}
    CELL_TEST_SET: ${{ matrix.test_set }}
    GIT_SHA: ${{ github.sha }}
    GIT_REF: ${{ github.ref }}
    GIT_ACTOR: ${{ github.actor }}
  run: |
    # Writes metadata.json with stage "started" into the cell directory.
    mkdir -p "${CELL_DIR}"
    cat > "${CELL_DIR}/metadata.json" <<JSON
    {
      "sweep_id": "${SWEEP_ID}",
      "mode": "smoke",
      "cell": {
        "method": "${CELL_METHOD}",
        "budget": ${CELL_BUDGET},
        "test_set": "${CELL_TEST_SET}",
        "tau": ${INPUT_TAU},
        "core_budget_fraction": ${INPUT_CBF}
      },
      "git": {
        "sha": "${GIT_SHA}",
        "ref": "${GIT_REF}",
        "actor": "${GIT_ACTOR}"
      },
      "started_at_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
      "runner": {
        "os": "${RUNNER_OS}",
        "arch": "${RUNNER_ARCH}",
        "name": "${RUNNER_NAME}",
        "image": "${IMAGE_REF}"
      },
      "stage": "started"
    }
    JSON
- name: Build per-cell manifest dir
  env:
    TEST_SET: ${{ matrix.test_set }}
  run: |
    # Copy this cell's manifest into its own directory; that directory is
    # later handed to the evaluator as --manifests-dir.
    dest=/tmp/manifest_one
    mkdir -p "${dest}"
    cp "${MANIFESTS_DIR}/test_${TEST_SET}.txt" "${dest}/"
- name: Resolve baseline + scoring
  id: cfg
  env:
    METHOD: ${{ matrix.method }}
  run: |
    # Map the matrix method onto the (baseline, scoring) pair exposed as
    # step outputs. An unrecognised method emits no outputs.
    case "${METHOD}" in
      ppr|ego) baseline=diffctx;    scoring="${METHOD}" ;;
      bm25)    baseline=bm25;       scoring=ego ;;
      aider)   baseline=aider_fair; scoring=ego ;;
      *)       exit 0 ;;
    esac
    {
      echo "baseline=${baseline}"
      echo "scoring=${scoring}"
    } >> "$GITHUB_OUTPUT"
- name: Build winner.json
  env:
    SCORING: ${{ steps.cfg.outputs.scoring }}
    BUDGET: ${{ matrix.budget }}
  run: |
    # Parameter file passed to the evaluator via --winner.
    cat > /tmp/winner.json <<JSON
    {
      "winner": {
        "tau": ${INPUT_TAU},
        "core_budget_fraction": ${INPUT_CBF},
        "budget": ${BUDGET},
        "scoring": "${SCORING}",
        "extra_env": {}
      }
    }
    JSON
- name: Install uv (required by aider baseline)
  # --no-cache-dir matches the equivalent step in sweep-full.
  run: pip install --no-cache-dir uv
- name: Run cell (smoke — 5 instances)
  timeout-minutes: 50
  shell: bash
  env:
    BASELINE: ${{ steps.cfg.outputs.baseline }}
  run: |
    # pipefail makes the step fail when the evaluator fails, even though its
    # output is piped through tee into run.log.
    set -o pipefail
    eval_args=(
      --baseline "${BASELINE}"
      --winner /tmp/winner.json
      --manifests-dir /tmp/manifest_one
      --workers 1
      --limit 5
      --timeout-per-instance "${INPUT_TIMEOUT}"
      --min-memory-gb 2
      --min-disk-gb 5
      --out "${CELL_DIR}"
    )
    python -m benchmarks.run_final_eval "${eval_args[@]}" 2>&1 | tee "${CELL_DIR}/run.log"
- name: Per-instance metric summary (always)
  if: always()
  env:
    TEST_SET: ${{ matrix.test_set }}
  run: |
    # Condense the checkpoint JSONL into cell_summary.json. The step runs
    # even after a failed or killed evaluation, so the checkpoint may end in
    # a half-written line; such lines are skipped (and counted on stderr)
    # instead of aborting the whole summary. The path is passed as argv and
    # the heredoc is quoted, so the shell never rewrites the Python source.
    ckpt="${CELL_DIR}/${TEST_SET}.checkpoint.jsonl"
    if [ -f "${ckpt}" ]; then
    python - "${ckpt}" <<'PY' > "${CELL_DIR}/cell_summary.json"
    import json, statistics, sys


    def metric(row, key):
        # Missing AND null values count as 0 so fmean never sees None.
        value = row.get(key)
        return 0 if value is None else value


    rows = []
    skipped = 0
    with open(sys.argv[1]) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if isinstance(row, dict):
                rows.append(row)
            else:
                skipped += 1
    if skipped:
        print(f"WARN: skipped {skipped} malformed checkpoint line(s)", file=sys.stderr)
    n = len(rows)
    if n == 0:
        print(json.dumps({"n": 0}))
    else:
        recall = [metric(r, "file_recall") for r in rows]
        precision = [metric(r, "file_precision") for r in rows]
        elapsed = [metric(r, "elapsed_seconds") for r in rows]
        tokens = [metric(r, "used_tokens") for r in rows]
        # fragment_recall / line_f1 are populated only when the dataset
        # ships gold_fragments (ContextBench, PolyBench when CST nodes
        # exist); skip nulls so the aggregator gets honest means rather
        # than zero-padded ones.
        frag_recall = [r["fragment_recall"] for r in rows if r.get("fragment_recall") is not None]
        line_f1 = [r["line_f1"] for r in rows if r.get("line_f1") is not None]
        ok = sum(1 for r in rows if (r.get("extra") or {}).get("status") == "ok")
        statuses = {}
        errors = {}
        for r in rows:
            s = (r.get("extra") or {}).get("status", "missing")
            statuses[s] = statuses.get(s, 0) + 1
            if s not in ("ok",):
                err = str((r.get("extra") or {}).get("error", ""))
                if err:
                    errors[err] = errors.get(err, 0) + 1
        out = {
            "n": n,
            "ok": ok,
            "ok_pct": 100.0 * ok / n if n else 0.0,
            "statuses": statuses,
            # Only the ten most frequent error messages are kept.
            "errors": dict(sorted(errors.items(), key=lambda x: -x[1])[:10]),
            "file_recall": {
                "mean": statistics.fmean(recall),
                "median": statistics.median(recall),
            },
            "file_precision": {"mean": statistics.fmean(precision)},
            "fragment_recall": (
                {"mean": statistics.fmean(frag_recall), "n_with_gold": len(frag_recall)}
                if frag_recall else None
            ),
            "line_f1": (
                {"mean": statistics.fmean(line_f1), "n_with_gold": len(line_f1)}
                if line_f1 else None
            ),
            "elapsed_seconds": {"mean": statistics.fmean(elapsed), "total": sum(elapsed)},
            "used_tokens": {"mean": statistics.fmean(tokens)},
        }
        print(json.dumps(out, indent=2))
        if errors:
            print("\n=== ERROR BREAKDOWN ===", file=sys.stderr)
            for msg, cnt in sorted(errors.items(), key=lambda x: -x[1]):
                print(f" [{cnt}x] {msg[:200]}", file=sys.stderr)
    PY
      cat "${CELL_DIR}/cell_summary.json"
    else
      printf '{"error": "no checkpoint produced", "expected_path": "%s"}\n' "${ckpt}" \
        > "${CELL_DIR}/cell_summary.json"
    fi
- name: Append cell-end metadata
  if: always()
  env:
    JOB_STATUS: ${{ job.status }}
  run: |
    # Add the end timestamp and job status to metadata.json. jq is used when
    # the image has it; otherwise Python does the same merge so the end
    # record is not silently dropped.
    end_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    meta="${CELL_DIR}/metadata.json"
    if command -v jq >/dev/null 2>&1; then
      tmp=$(mktemp)
      jq --arg end "${end_ts}" --arg status "${JOB_STATUS}" \
        '. + {ended_at_utc: $end, exit_status: $status}' \
        "${meta}" > "${tmp}" \
        && mv "${tmp}" "${meta}"
    else
      python - "${meta}" "${end_ts}" "${JOB_STATUS}" <<'PY'
    import json, sys

    path, end, status = sys.argv[1:4]
    with open(path) as f:
        meta = json.load(f)
    meta.update(ended_at_utc=end, exit_status=status)
    with open(path, "w") as f:
        json.dump(meta, f, indent=2)
        f.write("\n")
    PY
    fi
- name: Upload cell artifact
  # Runs even when earlier steps failed; an empty cell directory only
  # produces a warning.
  uses: actions/upload-artifact@v7
  if: always()
  with:
    path: ${{ env.CELL_DIR }}/
    name: cell-${{ matrix.method }}-b${{ matrix.budget }}-${{ matrix.test_set }}
    if-no-files-found: warn
    retention-days: 30
| # ============================================================================ | |
| # 3b. Full sweep — Hetzner CCX63, complete method × budget × depth × test_set matrix, baked repo cache | |
| # ============================================================================ | |
sweep-full:
  # Full mode: every matrix cell runs on one of the self-hosted runners
  # created by the provision job, inside the bench container with the
  # host's baked repo cache mounted.
  if: inputs.mode == 'full'
  needs:
    - prep
    - provision
  runs-on:
    - self-hosted
    - large
  timeout-minutes: 350
  permissions:
    contents: read
    packages: read
  strategy:
    fail-fast: false
    # Only 4 self-hosted runners exist on the CCX63 host, so the queue
    # cannot drain faster than that; cap matrix parallelism to match.
    max-parallel: 4
    matrix:
      method:
        - ppr
        - ego
        - bm25
        - aider
      # B=0 is a recall-floor sanity bound (must report ≈ 0); B=-1 is the
      # unlimited-budget ceiling. The 5 paying budgets
      # {8k, 16k, 32k, 64k, 128k} form the budget curve referenced in
      # STATS_PLAN.md.
      budget:
        - -1
        - 0
        - 8000
        - 16000
        - 32000
        - 64000
        - 128000
      # `depth` is the EGO graph traversal radius and is only meaningful
      # for method=ego: PPR uses alpha (not depth), BM25 has no graph and
      # Aider is its own subprocess. The sentinel -1 means "depth does not
      # apply"; non-EGO methods run only with depth=-1.
      depth:
        - -1
        - 0
        - 1
        - 2
        - 3
        - 4
      test_set:
        - contextbench_verified
        - polybench500
        - swebench_verified
      exclude:
        # bm25 and aider are not run at the unlimited budget.
        - method: bm25
          budget: -1
        - method: aider
          budget: -1
        # Non-EGO methods keep only their depth=-1 cell.
        - method: ppr
          depth: 0
        - method: ppr
          depth: 1
        - method: ppr
          depth: 2
        - method: ppr
          depth: 3
        - method: ppr
          depth: 4
        - method: bm25
          depth: 0
        - method: bm25
          depth: 1
        - method: bm25
          depth: 2
        - method: bm25
          depth: 3
        - method: bm25
          depth: 4
        - method: aider
          depth: 0
        - method: aider
          depth: 1
        - method: aider
          depth: 2
        - method: aider
          depth: 3
        - method: aider
          depth: 4
        # EGO always runs with an explicit depth.
        - method: ego
          depth: -1
  container:
    image: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
    options: >-
      --cpus 12
      --memory 44g
      --memory-swap 44g
      --shm-size 8g
      --pids-limit -1
      --ulimit nofile=1048576:1048576
      --ulimit nproc=65535:65535
      -v /data/bench_repos:/cache/contextbench_repos
  env:
    CELL_TAG: ${{ matrix.method }}_b${{ matrix.budget }}_L${{ matrix.depth }}_${{ matrix.test_set }}
    CELL_DIR: results/sweep/${{ matrix.method }}_b${{ matrix.budget }}_L${{ matrix.depth }}_${{ matrix.test_set }}
    MANIFESTS_DIR: benchmarks/manifests/v1
    INPUT_TAU: "0.12"
    INPUT_CBF: "0.5"
    INPUT_TIMEOUT: "600"
    INPUT_MANIFESTS_SUBDIR: v1
  steps:
    - uses: actions/checkout@v6
- name: Cell start metadata (write FIRST, before any failure point)
  # Context values reach the script through env vars instead of being
  # spliced into the script text: a ref or actor name containing shell
  # syntax such as $(...) would otherwise be executed by the unquoted
  # heredoc below.
  env:
    SWEEP_ID: ${{ needs.prep.outputs.sweep_id }}
    CELL_METHOD: ${{ matrix.method }}
    CELL_BUDGET: ${{ matrix.budget }}
    CELL_DEPTH: ${{ matrix.depth }}
    CELL_TEST_SET: ${{ matrix.test_set }}
    GIT_SHA: ${{ github.sha }}
    GIT_REF: ${{ github.ref }}
    GIT_ACTOR: ${{ github.actor }}
  run: |
    # Writes metadata.json with stage "started" into the cell directory.
    mkdir -p "${CELL_DIR}"
    cat > "${CELL_DIR}/metadata.json" <<JSON
    {
      "sweep_id": "${SWEEP_ID}",
      "mode": "full",
      "cell": {
        "method": "${CELL_METHOD}",
        "budget": ${CELL_BUDGET},
        "depth": ${CELL_DEPTH},
        "test_set": "${CELL_TEST_SET}",
        "tau": ${INPUT_TAU},
        "core_budget_fraction": ${INPUT_CBF}
      },
      "git": {
        "sha": "${GIT_SHA}",
        "ref": "${GIT_REF}",
        "actor": "${GIT_ACTOR}"
      },
      "started_at_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
      "runner": {
        "os": "${RUNNER_OS}",
        "arch": "${RUNNER_ARCH}",
        "name": "${RUNNER_NAME}",
        "image": "${IMAGE_REF}"
      },
      "stage": "started"
    }
    JSON
- name: Wait for shared repo cache
  run: |
    # Poll every 15 s for the marker files the host-side bake drops into the
    # cache directory: `.bake_failed` fails the cell (after dumping
    # bake.log), `.ready` lets it proceed. The failure marker is checked
    # first on every pass.
    cache=/cache/contextbench_repos
    while [ ! -f "${cache}/.bake_failed" ]; do
      if [ -f "${cache}/.ready" ]; then
        echo "Repo cache ready."
        exit 0
      fi
      echo "Waiting for repo cache extraction to complete..."
      sleep 15
    done
    echo "=== BAKE FAILED — full log ==="
    cat "${cache}/bake.log" || echo "(bake.log not found)"
    echo "=== end of bake.log ==="
    exit 1
- name: Reclaim disk — purge own runner's worktrees, prune bare caches
  shell: bash
  run: |
    set -euo pipefail
    cache_root=/cache/contextbench_repos
    wt_root="${cache_root}/worktrees"
    # `_ensure_worker_state` names worktrees `worktrees/w<runner>_<pid>/`
    # from `RUNNER_NAME`, with `-` and space mapped to `_`. The four
    # self-hosted runners share `/data/bench_repos` on the CCX63 host, so
    # wiping all of `worktrees/` would corrupt cells running concurrently on
    # the other runners. Cleanup is therefore limited to directories that
    # carry this runner's own prefix.
    slug="$(printf '%s' "${RUNNER_NAME:-unknown}" | tr ' -' '__')"

    echo "--- Disk before cleanup ---"
    df -h "${cache_root}" || true
    du -sh "${wt_root}" 2>/dev/null || echo " (no worktrees dir)"
    echo " runner_slug=${slug}"

    # 1. Remove only THIS runner's worktree dirs left behind by earlier
    #    cells; dirs belonging to other runners have a different prefix.
    if [ -d "${wt_root}" ]; then
      find "${wt_root}" -mindepth 1 -maxdepth 1 -type d \
        -name "w${slug}_*" -prune -exec rm -rf {} + 2>/dev/null || true
    fi

    # 2. Prune dangling worktree registrations in every bare cache. Safe
    #    while other runners hold live registrations: `git worktree prune`
    #    only drops entries whose target directory is missing.
    for repo in "${cache_root}"/*/; do
      [ -d "${repo}/.git" ] || [ -f "${repo}/HEAD" ] || continue
      git -C "${repo}" worktree prune --expire=now 2>/dev/null || true
    done

    # 3. Headroom check. One cell on a large repo (angular, AutoGPT,
    #    transformers) can grow worktrees to 50-100 GB, and 4 cells share
    #    the same physical disk, so we want roughly 150 GB free (4 cells x
    #    ~35 GB typical peak, plus swap and other infrastructure). Run #49
    #    hit ENOSPC at ~310 GB free at cell start, so 150 GB is the
    #    conservative floor.
    free_kb=$(df --output=avail "${cache_root}" | tail -1 | tr -d ' ')
    free_gb=$((free_kb / 1024 / 1024))
    echo "--- Disk after cleanup: ${free_gb} GB available ---"
    df -h "${cache_root}"
    if (( free_gb < 150 )); then
      # Warn, give the other runners 60 s to release space, then re-measure;
      # only a reading below 100 GB after the wait aborts the cell.
      echo "::warning::Less than 150 GB free on /cache/contextbench_repos after cleanup. Other runners may be holding live worktrees; retrying in 60s in case they are about to release." >&2
      sleep 60
      free_kb=$(df --output=avail "${cache_root}" | tail -1 | tr -d ' ')
      free_gb=$((free_kb / 1024 / 1024))
      echo "--- Disk after retry wait: ${free_gb} GB ---"
      if (( free_gb < 100 )); then
        echo "::error::Disk still tight (${free_gb} GB); aborting cell." >&2
        exit 1
      fi
    fi
- name: Validate bake manifest and purge known-bad repos
  shell: bash
  run: |
    set -euo pipefail
    # Without a manifest there is nothing to purge; warn and move on.
    MANIFEST=/cache/contextbench_repos/.bake_manifest.json
    if [ ! -f "${MANIFEST}" ]; then
      echo "WARN: no .bake_manifest.json found — proceeding without preflight purge"
      exit 0
    fi
    python3 - <<'PY'
    import json
    import shutil
    from pathlib import Path

    cache_root = Path("/cache/contextbench_repos")
    manifest = json.loads((cache_root / ".bake_manifest.json").read_text())
    failed_clones = manifest.get("failed_clones", [])
    failed_fetches = manifest.get("failed_fetches", [])

    # Delete the cache directory of every repo whose clone failed during the
    # bake; the directory name is the repo slug with "/" replaced by "__".
    removed = []
    for item in failed_clones:
        name = item["repo"]
        target = cache_root / name.replace("/", "__")
        if not target.exists():
            continue
        shutil.rmtree(target, ignore_errors=True)
        removed.append(name)

    more = "..." if len(removed) > 10 else ""
    print(f"Purged {len(removed)} bake-failed repos: {removed[:10]}{more}")
    print(f"Bake manifest: failed_clones={len(failed_clones)} failed_fetches={len(failed_fetches)}")
    PY
- name: System info dump
  run: |
    # Best-effort environment snapshot written next to the cell results; the
    # trailing `|| true` keeps a failing probe from failing the step.
    banner() { echo "=== $1 ==="; }
    {
      banner "uname -a"
      uname -a
      echo
      banner "/proc/cpuinfo (1 core)"
      awk '/^processor/{p++} p<2' /proc/cpuinfo
      echo
      banner "/proc/meminfo (head)"
      head -5 /proc/meminfo
      echo
      banner "df -h"
      df -h
      echo
      banner "python"
      python --version
      echo
      banner "pip freeze (head 30)"
      pip freeze 2>/dev/null | head -30 || true
      echo
      banner "rustc"
      rustc --version 2>/dev/null || echo "no rustc"
      echo
      banner "cargo"
      cargo --version 2>/dev/null || echo "no cargo"
      echo
      banner "git"
      git --version
    } > "${CELL_DIR}/system_info.log" 2>&1 || true
- name: Build per-cell single-test-set manifest dir
  env:
    TEST_SET: ${{ matrix.test_set }}
  run: |
    # Copy this cell's manifest into its own directory; that directory is
    # later handed to the evaluator as --manifests-dir.
    dest=/tmp/manifest_one
    mkdir -p "${dest}"
    cp "${MANIFESTS_DIR}/test_${TEST_SET}.txt" "${dest}/"
- name: Resolve baseline + scoring
  id: cfg
  env:
    METHOD: ${{ matrix.method }}
  run: |
    # Map the matrix method onto the (baseline, scoring) pair exposed as
    # step outputs. An unrecognised method emits no outputs.
    case "${METHOD}" in
      ppr|ego) baseline=diffctx;    scoring="${METHOD}" ;;
      bm25)    baseline=bm25;       scoring=ego ;;
      aider)   baseline=aider_fair; scoring=ego ;;
      *)       exit 0 ;;
    esac
    {
      echo "baseline=${baseline}"
      echo "scoring=${scoring}"
    } >> "$GITHUB_OUTPUT"
- name: Build winner.json
  env:
    SCORING: ${{ steps.cfg.outputs.scoring }}
    BUDGET: ${{ matrix.budget }}
  run: |
    # Parameter file passed to the evaluator via --winner.
    cat > /tmp/winner.json <<JSON
    {
      "winner": {
        "tau": ${INPUT_TAU},
        "core_budget_fraction": ${INPUT_CBF},
        "budget": ${BUDGET},
        "scoring": "${SCORING}",
        "extra_env": {}
      }
    }
    JSON
- name: Install uv (required by aider baseline; safe no-op for others)
  run: |
    pip install --no-cache-dir uv
- name: Write checkpoint heartbeat script
  run: |
    # Generates /tmp/heartbeat.py, which the "Run cell" step launches in the
    # background. Every 10 minutes it tries to upload the current checkpoint
    # as an artifact. The heredoc delimiter is quoted, so the shell leaves
    # the Python source untouched.
    cat > /tmp/heartbeat.py << 'PYEOF'
    import os, sys, time, json, pathlib, urllib.request

    CKPT = pathlib.Path("${{ env.CELL_DIR }}/${{ matrix.test_set }}.checkpoint.jsonl")
    ARTIFACT_NAME = "heartbeat-${{ env.CELL_TAG }}"

    RT_URL = os.environ.get("ACTIONS_RUNTIME_URL", "")
    RT_TOK = os.environ.get("ACTIONS_RUNTIME_TOKEN", "")
    RUN_ID = os.environ.get("GITHUB_RUN_ID", "")


    def warn(msg):
        print(f"[heartbeat] {msg}", file=sys.stderr, flush=True)


    def try_upload():
        """Upload the checkpoint once; every failure is logged, never raised."""
        if not CKPT.exists():
            return
        payload = json.dumps({"type": "actions_storage", "name": ARTIFACT_NAME}).encode()
        req = urllib.request.Request(
            f"{RT_URL}_apis/pipelines/workflows/{RUN_ID}/artifacts?api-version=6.0-preview",
            data=payload,
            headers={"Authorization": f"Bearer {RT_TOK}", "Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=30) as r:
                resp = json.loads(r.read())
        except Exception as e:
            warn(f"create: {e}")
            return
        container_url = resp.get("fileContainerResourceUrl", "")
        if not container_url:
            warn("create: response had no fileContainerResourceUrl; skipping upload")
            return
        blob = CKPT.read_bytes()
        put_req = urllib.request.Request(
            f"{container_url}?itemPath={ARTIFACT_NAME}/checkpoint.jsonl",
            data=blob,
            headers={"Authorization": f"Bearer {RT_TOK}", "Content-Type": "application/octet-stream"},
            method="PUT",
        )
        try:
            urllib.request.urlopen(put_req, timeout=120)
            print(f"[heartbeat] uploaded {len(blob)} bytes", flush=True)
        except Exception as e:
            warn(f"put: {e}")


    # Without runtime credentials no upload can ever succeed; say so once and
    # stop instead of sleeping silently for the rest of the cell.
    # NOTE(review): ACTIONS_RUNTIME_URL / ACTIONS_RUNTIME_TOKEN are normally
    # exposed to actions rather than to plain run steps; confirm this
    # heartbeat can upload at all in this job.
    if not (RT_URL and RT_TOK and RUN_ID):
        warn("ACTIONS_RUNTIME_URL / ACTIONS_RUNTIME_TOKEN / GITHUB_RUN_ID not set; uploads disabled")
        sys.exit(0)

    while True:
        time.sleep(600)
        try:
            try_upload()
        except Exception as e:
            warn(f"loop: {e}")
    PYEOF
- name: Run cell (logged to run.log)
  timeout-minutes: 330
  shell: bash
  env:
    BASELINE: ${{ steps.cfg.outputs.baseline }}
    MATRIX_DEPTH: ${{ matrix.depth }}
  run: |
    set -o pipefail
    python3 /tmp/heartbeat.py >> "${CELL_DIR}/heartbeat.log" 2>&1 &
    HEARTBEAT_PID=$!
    # Stop the heartbeat on EVERY exit path. With `shell: bash` the script
    # runs under errexit, so a failing evaluation ends the script right at
    # the pipeline below; a trailing `kill` would never be reached and the
    # heartbeat would outlive this step.
    trap 'kill "${HEARTBEAT_PID}" 2>/dev/null || true' EXIT
    # Apply the EGO graph depth as an env override only when it is not the
    # sentinel -1 ("depth does not apply to this method"); in that case
    # DIFFCTX_OP_GRAPH_DEPTH stays unset and the default applies.
    if [ "${MATRIX_DEPTH}" != "-1" ]; then
      export DIFFCTX_OP_GRAPH_DEPTH="${MATRIX_DEPTH}"
    fi
    python -m benchmarks.run_final_eval \
      --baseline "${BASELINE}" \
      --winner /tmp/winner.json \
      --manifests-dir /tmp/manifest_one \
      --workers 10 \
      --timeout-per-instance "${INPUT_TIMEOUT}" \
      --min-memory-gb 32 \
      --min-disk-gb 50 \
      --out "${CELL_DIR}" 2>&1 | tee "${CELL_DIR}/run.log"
- name: Per-instance metric summary (always)
  if: always()
  env:
    TEST_SET: ${{ matrix.test_set }}
  run: |
    # Condense the checkpoint JSONL into cell_summary.json. The step runs
    # even after a failed or killed evaluation, so the checkpoint may end in
    # a half-written line; such lines are skipped (and counted on stderr)
    # instead of aborting the whole summary. The path is passed as argv and
    # the heredoc is quoted, so the shell never rewrites the Python source.
    ckpt="${CELL_DIR}/${TEST_SET}.checkpoint.jsonl"
    if [ -f "${ckpt}" ]; then
    python - "${ckpt}" <<'PY' > "${CELL_DIR}/cell_summary.json"
    import json, statistics, sys


    def metric(row, key):
        # Missing AND null values count as 0 so fmean/max never see None.
        value = row.get(key)
        return 0 if value is None else value


    rows = []
    skipped = 0
    with open(sys.argv[1]) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if isinstance(row, dict):
                rows.append(row)
            else:
                skipped += 1
    if skipped:
        print(f"WARN: skipped {skipped} malformed checkpoint line(s)", file=sys.stderr)
    n = len(rows)
    if n == 0:
        print(json.dumps({"n": 0}))
    else:
        recall = [metric(r, "file_recall") for r in rows]
        precision = [metric(r, "file_precision") for r in rows]
        elapsed = [metric(r, "elapsed_seconds") for r in rows]
        tokens = [metric(r, "used_tokens") for r in rows]
        # fragment_recall / line_f1 are populated only when the dataset
        # ships gold_fragments (ContextBench, PolyBench when CST nodes
        # exist); skip nulls so the aggregator gets honest means rather
        # than zero-padded ones.
        frag_recall = [r["fragment_recall"] for r in rows if r.get("fragment_recall") is not None]
        line_f1 = [r["line_f1"] for r in rows if r.get("line_f1") is not None]
        ok = sum(1 for r in rows if (r.get("extra") or {}).get("status") == "ok")
        statuses = {}
        errors = {}
        for r in rows:
            s = (r.get("extra") or {}).get("status", "missing")
            statuses[s] = statuses.get(s, 0) + 1
            if s not in ("ok",):
                err = str((r.get("extra") or {}).get("error", ""))
                if err:
                    errors[err] = errors.get(err, 0) + 1
        out = {
            "n": n,
            "ok": ok,
            "ok_pct": 100.0 * ok / n if n else 0.0,
            "statuses": statuses,
            # Only the ten most frequent error messages are kept.
            "errors": dict(sorted(errors.items(), key=lambda x: -x[1])[:10]),
            "file_recall": {
                "mean": statistics.fmean(recall),
                "median": statistics.median(recall),
                "p25": sorted(recall)[max(0, n*1//4-1)],
                "p75": sorted(recall)[min(n-1, n*3//4)],
            },
            "file_precision": {"mean": statistics.fmean(precision)},
            "fragment_recall": (
                {"mean": statistics.fmean(frag_recall), "n_with_gold": len(frag_recall)}
                if frag_recall else None
            ),
            "line_f1": (
                {"mean": statistics.fmean(line_f1), "n_with_gold": len(line_f1)}
                if line_f1 else None
            ),
            "elapsed_seconds": {
                "mean": statistics.fmean(elapsed),
                "max": max(elapsed),
                "total": sum(elapsed),
            },
            "used_tokens": {
                "mean": statistics.fmean(tokens),
                "max": max(tokens),
            },
        }
        print(json.dumps(out, indent=2))
        if errors:
            print("\n=== ERROR BREAKDOWN ===", file=sys.stderr)
            for msg, cnt in sorted(errors.items(), key=lambda x: -x[1]):
                print(f" [{cnt}x] {msg[:200]}", file=sys.stderr)
    PY
      cat "${CELL_DIR}/cell_summary.json"
    else
      printf '{"error": "no checkpoint produced", "expected_path": "%s"}\n' "${ckpt}" \
        > "${CELL_DIR}/cell_summary.json"
      ls -la "${CELL_DIR}" || true
    fi
- name: Append cell-end metadata
  if: always()
  env:
    JOB_STATUS: ${{ job.status }}
  run: |
    # Add the end timestamp, job status and a root-filesystem `df` line to
    # metadata.json. jq is used when the image has it; otherwise Python does
    # the same merge so the end record is not silently dropped.
    end_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    disk_line="$(df -h / | tail -1)"
    meta="${CELL_DIR}/metadata.json"
    if command -v jq >/dev/null 2>&1; then
      tmp=$(mktemp)
      jq --arg end "${end_ts}" --arg status "${JOB_STATUS}" --arg disk "${disk_line}" \
        '. + {ended_at_utc: $end, exit_status: $status, runner_disk_at_end: $disk}' \
        "${meta}" > "${tmp}" \
        && mv "${tmp}" "${meta}"
    else
      python3 - "${meta}" "${end_ts}" "${JOB_STATUS}" "${disk_line}" <<'PY'
    import json, sys

    path, end, status, disk = sys.argv[1:5]
    with open(path) as f:
        meta = json.load(f)
    meta.update(ended_at_utc=end, exit_status=status, runner_disk_at_end=disk)
    with open(path, "w") as f:
        json.dump(meta, f, indent=2)
        f.write("\n")
    PY
    fi
    cat "${meta}"
- name: Upload cell artifact (90-day retention)
  # Runs even when earlier steps failed; an empty cell directory only
  # produces a warning.
  uses: actions/upload-artifact@v7
  if: always()
  with:
    path: ${{ env.CELL_DIR }}/
    name: cell-${{ matrix.method }}-b${{ matrix.budget }}-L${{ matrix.depth }}-${{ matrix.test_set }}
    if-no-files-found: warn
    retention-days: 90
- name: Reclaim disk — drop this cell's per-worker worktrees
  if: always()
  shell: bash
  run: |
    set -e
    cache_root=/cache/contextbench_repos
    wt_root="${cache_root}/worktrees"
    slug="$(printf '%s' "${RUNNER_NAME:-unknown}" | tr ' -' '__')"
    # Defense-in-depth: clean up at the END of each cell too, so that a
    # crash in the NEXT cell's preflight does not leave the runner out of
    # disk. Only this runner's dirs are touched; the other 3 self-hosted
    # runners on the CCX63 share the disk and may hold live worktrees.
    if [ -d "${wt_root}" ]; then
      find "${wt_root}" -mindepth 1 -maxdepth 1 -type d \
        -name "w${slug}_*" -prune -exec rm -rf {} + 2>/dev/null || true
    fi
    # Drop worktree registrations whose directories are now gone.
    for repo in "${cache_root}"/*/; do
      [ -d "${repo}/.git" ] || [ -f "${repo}/HEAD" ] || continue
      git -C "${repo}" worktree prune --expire=now 2>/dev/null || true
    done
    df -h "${cache_root}" || true
| # ============================================================================ | |
| # 4. Aggregate all cells into per-cell summary table + raw artifact archive. | |
| # ============================================================================ | |
# Downloads every cell-* artifact, builds the aggregate tables with
# benchmarks.aggregate_sweep, uploads them, and archives tables plus raw cells
# under <sweep_id>/ on the results branch.
aggregate:
  needs: [prep, sweep-smoke, sweep-full]
  # Run even when an upstream sweep job failed or was skipped.
  if: always()
  runs-on: ubuntu-latest
  permissions:
    contents: write  # needed to push the results branch
  env:
    # Exposed as a variable so scripts never splice the expression into shell.
    SWEEP_ID: ${{ needs.prep.outputs.sweep_id }}
  steps:
    - uses: actions/checkout@v6
    - uses: actions/setup-python@v6
      with:
        python-version: '3.12'
    - name: Install deps
      run: pip install numpy scipy
    - name: Download all cell artifacts
      uses: actions/download-artifact@v8
      with:
        path: all_cells
        pattern: cell-*
    - name: Aggregate per-cell summaries
      run: |
        mkdir -p aggregated
        python -m benchmarks.aggregate_sweep \
          --cells-dir all_cells \
          --sweep-id "${SWEEP_ID}" \
          --out aggregated/
    - name: Show aggregate summary
      run: |
        echo "=== aggregated/ ==="
        ls -la aggregated/
        echo
        echo "=== aggregated/SWEEP_TABLE.md ==="
        cat aggregated/SWEEP_TABLE.md || true
        echo
        echo "=== aggregated/grand_summary.json (head 80) ==="
        head -80 aggregated/grand_summary.json || true
    - name: Upload aggregated tables (1-year retention)
      uses: actions/upload-artifact@v7
      with:
        name: sweep-aggregated-${{ needs.prep.outputs.sweep_id }}
        path: aggregated/
        retention-days: 365
    - name: Commit raw artifacts to results branch
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        REPO: ${{ github.repository }}
        TS: ${{ needs.prep.outputs.ts }}
        # Taken from the workflow-level env so the README written below cannot
        # drift from the values the sweep actually ran with.
        BR: ${{ env.RESULTS_BRANCH }}
        INPUT_TAU: ${{ env.TAU }}
        INPUT_CBF: ${{ env.CORE_BUDGET_FRACTION }}
        INPUT_TIMEOUT: ${{ env.TIMEOUT_PER_INSTANCE }}
        INPUT_MANIFESTS_SUBDIR: ${{ env.MANIFESTS_SUBDIR }}
      run: |
        git clone --depth 1 --branch main \
          "https://x-access-token:${GITHUB_TOKEN}@github.com/${REPO}.git" \
          sidecar
        cd sidecar
        # `git config` without --global writes to the repository in the
        # current directory, so the identity must be set inside the clone
        # that creates the commit, not in the workspace checkout.
        git config user.name 'github-actions[bot]'
        git config user.email 'github-actions[bot]@users.noreply.github.com'
        if git ls-remote --heads origin "${BR}" | grep -q "${BR}"; then
          git fetch origin "${BR}":"${BR}"
          git checkout "${BR}"
        else
          # First sweep ever: start the results branch with an empty tree.
          git checkout --orphan "${BR}"
          git rm -rf . 2>/dev/null || true
        fi
        # Copy directory *contents* into pre-created targets so re-running
        # this job for the same sweep id overwrites in place instead of
        # nesting aggregated/aggregated.
        mkdir -p "${SWEEP_ID}/aggregated" "${SWEEP_ID}/cells"
        cp -r ../aggregated/. "${SWEEP_ID}/aggregated/"
        cp -r ../all_cells/. "${SWEEP_ID}/cells/"
        cat > "${SWEEP_ID}/README.md" <<EOF
        # Sweep ${SWEEP_ID}
        - Triggered: ${TS}
        - Source SHA: ${GITHUB_SHA}
        - Manifests: ${INPUT_MANIFESTS_SUBDIR}
        - tau=${INPUT_TAU}, core_budget_fraction=${INPUT_CBF}
        - Per-instance timeout: ${INPUT_TIMEOUT}s
        See \`aggregated/SWEEP_TABLE.md\` and \`cells/\` for raw per-instance JSONL checkpoints.
        EOF
        git add -A
        # Only an empty index counts as "nothing to commit"; any other commit
        # failure must stop the step instead of being swallowed.
        if git diff --cached --quiet; then
          echo "nothing to commit"
        else
          git commit -m "sweep ${SWEEP_ID} (source ${GITHUB_SHA})"
        fi
        pushed=false
        for attempt in 1 2 3; do
          echo "Push attempt ${attempt}/3..."
          # Best-effort rebase; fails harmlessly when the branch does not
          # exist on the remote yet.
          git pull --rebase origin "${BR}" 2>/dev/null || true
          if git push -u origin "${BR}"; then
            pushed=true
            break
          fi
          if [ "${attempt}" -lt 3 ]; then
            sleep 10
          fi
        done
        # Without this check the loop would fall through with exit status 0
        # and the sweep results would be lost silently.
        if [ "${pushed}" != true ]; then
          echo "::error::could not push ${BR} after 3 attempts"
          exit 1
        fi
| # ============================================================================ | |
| # 5. Deregister runners and destroy Hetzner server — full mode only | |
| # ============================================================================ | |
# Tears down the full-mode infrastructure: removes this run's self-hosted
# runners from the repository, then deletes the Hetzner server whose id was
# published as the hetzner-server-id artifact.
cleanup:
  needs: [sweep-full]
  if: always() && inputs.mode == 'full'
  runs-on: ubuntu-latest
  permissions:
    actions: write
  steps:
    - name: Deregister self-hosted runners
      env:
        GH_TOKEN: ${{ secrets.GH_PAT }}
        REPO: ${{ github.repository }}
        # Same value as the workflow-level SERVER_TAG; referenced instead of
        # re-typed so the prefix cannot drift.
        SERVER_NAME: ${{ env.SERVER_TAG }}
      run: |
        set -euo pipefail
        # Collect the ids of all runners whose name starts with this run's tag.
        RUNNER_IDS=$(curl -sf \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          "https://api.github.com/repos/${REPO}/actions/runners?per_page=100" \
          | jq --arg pfx "${SERVER_NAME}" \
            '[.runners[] | select(.name | startswith($pfx)) | .id]')
        echo "Runner IDs to remove: ${RUNNER_IDS}"
        echo "${RUNNER_IDS}" | jq -r '.[]' | while read -r rid; do
          echo "  Deleting runner ${rid}..."
          # Best-effort per runner: one failed delete must not stop the rest.
          curl -sf -X DELETE \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${REPO}/actions/runners/${rid}" || true
        done
    - name: Download server_id artifact
      # always(): a failure while deregistering runners must never prevent
      # the server from being deleted.
      if: always()
      uses: actions/download-artifact@v8
      with:
        name: hetzner-server-id
        path: /tmp/hetzner
    - name: Delete Hetzner server
      if: always()
      env:
        HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
      run: |
        set -euo pipefail
        id_file=/tmp/hetzner/server_id.txt
        # The artifact is absent when the download step failed; there is then
        # no id to act on.
        if [ ! -s "${id_file}" ]; then
          echo "::warning::${id_file} is missing or empty — no server ID to delete."
          exit 0
        fi
        SERVER_ID="$(tr -d '[:space:]' < "${id_file}")"
        if [ -z "${SERVER_ID}" ] || [ "${SERVER_ID}" = "null" ]; then
          echo "No server ID found — nothing to delete."
          exit 0
        fi
        echo "Deleting Hetzner server ${SERVER_ID}..."
        for attempt in 1 2 3; do
          # Capture the HTTP status instead of discarding it; curl prints 000
          # when the request never reached the API.
          status="$(curl -s -o /dev/null -w '%{http_code}' -X DELETE \
            "https://api.hetzner.cloud/v1/servers/${SERVER_ID}" \
            -H "Authorization: Bearer ${HCLOUD_TOKEN}")" || true
          case "${status}" in
            2??)
              echo "Server ${SERVER_ID} deletion requested."
              exit 0
              ;;
            404)
              echo "Server ${SERVER_ID} no longer exists."
              exit 0
              ;;
          esac
          echo "Delete attempt ${attempt}/3 failed (HTTP ${status:-000})."
          if [ "${attempt}" -lt 3 ]; then
            sleep 10
          fi
        done
        # A server that keeps running keeps billing, so surface the failure.
        echo "::error::Hetzner server ${SERVER_ID} was NOT deleted — remove it manually."
        exit 1