bench-sweep #44
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# bench-sweep: multi-runner parallel sweep across method × budget × test_set.
#
# The mode is chosen at dispatch time:
#   smoke — GitHub-hosted ubuntu-latest; 4 cells (ppr, ego, bm25, aider at
#           budget 64000 on swebench_verified), 5 instances per cell, no
#           Hetzner provisioning. Quick sanity check.
#   full  — Hetzner CCX63 with 4 self-hosted runners; 4 methods × 7 budgets ×
#           3 test sets, minus the budget -1 cells excluded for bm25 and
#           aider (78 cells), using the baked repo cache. Production sweep.
#
# Triggered by workflow_dispatch only. Results are committed to
# bench-results/sweep.
name: bench-sweep

'on':
  workflow_dispatch:
    inputs:
      mode:
        description: 'smoke = GH Actions, 5 instances | full = Hetzner CCX63, complete sweep'
        type: choice
        options:
          - smoke
          - full
        default: smoke
        required: true

# Runs share one concurrency group; a newly dispatched run does not cancel
# one that is already in progress.
concurrency:
  group: bench-sweep
  cancel-in-progress: false

# Workflow-wide defaults, visible to every job and step.
env:
  REGISTRY: ghcr.io
  IMAGE_REF: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
  MANIFESTS_SUBDIR: v1
  TIMEOUT_PER_INSTANCE: '600'
  TAU: '0.12'
  CORE_BUDGET_FRACTION: '0.5'
  RESULTS_BRANCH: bench-results/sweep
  SERVER_TAG: bench-${{ github.run_id }}-${{ github.run_attempt }}
| jobs: | |
| # ============================================================================ | |
| # 1. Compute a stable sweep_id used as the directory name for raw artifacts | |
| # ============================================================================ | |
prep:
  # Derives the sweep identifier (UTC timestamp + short commit SHA) and the
  # timestamp itself; later jobs read both through needs.prep.outputs.
  runs-on: ubuntu-latest
  permissions:
    contents: read
  outputs:
    sweep_id: ${{ steps.id.outputs.sweep_id }}
    ts: ${{ steps.id.outputs.ts }}
  steps:
    - uses: actions/checkout@v6
    - id: id
      run: |
        stamp="$(date -u +%Y%m%dT%H%M%SZ)"
        short_sha="$(git rev-parse --short HEAD)"
        {
          echo "sweep_id=sweep-${stamp}-${short_sha}"
          echo "ts=${stamp}"
        } >> "$GITHUB_OUTPUT"
| # ============================================================================ | |
| # 2. Provision Hetzner CCX63 — full mode only | |
| # ============================================================================ | |
provision:
  # Full mode only. Creates one Hetzner CCX63 whose cloud-init script installs
  # Docker, pre-pulls the bench image, starts the repo-cache bake in the
  # background and registers four self-hosted runners; the job then waits
  # until all four runners report online.
  needs: prep
  if: inputs.mode == 'full'
  runs-on: ubuntu-latest
  permissions:
    actions: write
  outputs:
    server_id: ${{ steps.create.outputs.server_id }}
    server_ip: ${{ steps.create.outputs.server_ip }}
  steps:
    - name: Create Hetzner CCX63 with 4 self-hosted runners
      id: create
      env:
        HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
        GH_TOKEN: ${{ secrets.GH_PAT }}
        REPO: ${{ github.repository }}
        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
        BENCH_SSH_PUBLIC_KEY: ${{ secrets.BENCH_SSH_PUBLIC_KEY }}
      run: |
        set -euo pipefail
        REG_TOKEN=$(curl -sf -X POST \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "https://api.github.com/repos/${REPO}/actions/runners/registration-token" \
          | jq -r '.token')
        RUNNER_VER=$(curl -sf \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          "https://api.github.com/repos/actions/runner/releases/latest" \
          | jq -r '.tag_name[1:]')
        REPO_URL="https://github.com/${REPO}"
        # Unquoted heredoc: plain ${VAR} is expanded here, on the GitHub runner;
        # escaped \${VAR} / \$(...) is left for the server to evaluate when
        # cloud-init executes the script. REGISTRY and IMAGE_REF come from the
        # workflow-level env so the pre-pulled image is the same one the sweep
        # jobs use as their container (previously hard-coded to one owner).
        # NOTE(review): GH_TOKEN and REG_TOKEN end up inside the server's
        # user_data — confirm that exposure is acceptable.
        cat > /tmp/cloud-init.sh << ENDINIT
        #!/bin/bash
        set -euo pipefail
        export DEBIAN_FRONTEND=noninteractive
        apt-get update -y
        apt-get install -y docker.io curl jq
        systemctl enable docker && systemctl start docker
        mkdir -p /root/.ssh && chmod 700 /root/.ssh
        echo "${BENCH_SSH_PUBLIC_KEY}" >> /root/.ssh/authorized_keys
        chmod 600 /root/.ssh/authorized_keys
        chage -M -1 root
        useradd -m -s /bin/bash github
        usermod -aG docker github
        echo "${GH_TOKEN}" | docker login ${REGISTRY} -u ${GITHUB_REPOSITORY_OWNER} --password-stdin
        docker pull ${IMAGE_REF}
        nohup bash -c '
          mkdir -p /data/bench_repos
          BAKE_LOG=/data/bench_repos/bake.log
          if docker run --rm \
            --entrypoint python3 \
            -v /data/bench_repos:/cache/contextbench_repos \
            -e BAKE_PARALLELISM=16 \
            ${IMAGE_REF} \
            /app/scripts/bake_bench_cache.py /cache/contextbench_repos \
            >> "\${BAKE_LOG}" 2>&1; then
            touch /data/bench_repos/.ready
            echo "[bake] OK at \$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "\${BAKE_LOG}"
          else
            rc=\$?
            touch /data/bench_repos/.bake_failed
            echo "[bake] FAILED rc=\${rc} at \$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "\${BAKE_LOG}"
          fi
        ' &
        curl -fsSL -o /tmp/runner.tar.gz \
          "https://github.com/actions/runner/releases/download/v${RUNNER_VER}/actions-runner-linux-x64-${RUNNER_VER}.tar.gz"
        for N in 1 2 3 4; do
          mkdir -p "/home/github/runner-\${N}"
          tar xzf /tmp/runner.tar.gz -C "/home/github/runner-\${N}"
          chown -R github:github "/home/github/runner-\${N}"
          sudo -u github bash -c "
            cd /home/github/runner-\${N} &&
            ./config.sh \
              --url '${REPO_URL}' \
              --token '${REG_TOKEN}' \
              --name '${SERVER_NAME}-\${N}' \
              --labels 'self-hosted,large' \
              --unattended \
              --replace"
          nohup sudo -u github bash -c \
            "cd /home/github/runner-\${N} && ./run.sh" \
            >> "/home/github/runner-\${N}/runner.log" 2>&1 &
        done
        ENDINIT
        RESPONSE=$(curl -sf -X POST "https://api.hetzner.cloud/v1/servers" \
          -H "Authorization: Bearer ${HCLOUD_TOKEN}" \
          -H "Content-Type: application/json" \
          -d "{
            \"name\": \"${SERVER_NAME}\",
            \"server_type\": \"ccx63\",
            \"image\": \"ubuntu-24.04\",
            \"location\": \"nbg1\",
            \"user_data\": $(python3 -c "import json,sys; print(json.dumps(open('/tmp/cloud-init.sh').read()))")
          }")
        SERVER_ID=$(echo "${RESPONSE}" | jq -r '.server.id')
        SERVER_IP=$(echo "${RESPONSE}" | jq -r '.server.public_net.ipv4.ip')
        # jq prints the string "null" when the field is absent; stop here rather
        # than hand a bogus id to the cleanup job. RESPONSE is deliberately not
        # echoed.
        if [[ -z "${SERVER_ID}" || "${SERVER_ID}" == "null" ]]; then
          echo "ERROR: Hetzner API response did not contain a server id." >&2
          exit 1
        fi
        echo "server_id=${SERVER_ID}" >> "${GITHUB_OUTPUT}"
        echo "server_ip=${SERVER_IP}" >> "${GITHUB_OUTPUT}"
        echo "${SERVER_ID}" > server_id.txt
        echo "Provisioned Hetzner CCX63 server_id=${SERVER_ID} ip=${SERVER_IP} name=${SERVER_NAME}"
        SSHCMD="ssh root@${SERVER_IP} -i <(security find-generic-password -s treemapper-bench-ssh-key -a \$USER -w)"
        echo "SSH: ${SSHCMD}"
    # Uploaded before the wait step so the cleanup job can find the server even
    # when the runners never come online.
    - name: Upload server_id for cleanup workflow
      uses: actions/upload-artifact@v7
      with:
        name: hetzner-server-id
        path: server_id.txt
        retention-days: 7
    - name: Wait for all 4 runners to come online
      env:
        GH_TOKEN: ${{ secrets.GH_PAT }}
        REPO: ${{ github.repository }}
        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
      run: |
        set -euo pipefail
        echo "Polling for 4 runners prefixed '${SERVER_NAME}'..."
        DEADLINE=$((SECONDS + 900))
        while [[ ${SECONDS} -lt ${DEADLINE} ]]; do
          # A single failed API call must not abort the whole wait under
          # set -e / pipefail: count it as "0 online" and poll again.
          # The trailing "-" keeps attempt 1 from matching attempt 10+ runners.
          ONLINE=$(curl -sf \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${REPO}/actions/runners?per_page=100" \
            | jq --arg pfx "${SERVER_NAME}-" \
              '[.runners[] | select(.name | startswith($pfx)) | select(.status == "online")] | length') \
            || ONLINE=0
          echo "  Online runners: ${ONLINE}/4"
          if [[ "${ONLINE}" -ge 4 ]]; then
            echo "All 4 runners ready."
            exit 0
          fi
          sleep 30
        done
        echo "ERROR: runners did not come online within 15 minutes." >&2
        exit 1
| # ============================================================================ | |
| # 3a. Smoke sweep — ubuntu-latest, minimal matrix, 5 instances, no bake cache | |
| # ============================================================================ | |
sweep-smoke:
  # Smoke mode: one cell per method at budget 64000 on swebench_verified, run
  # on a GitHub-hosted runner inside the bench image, limited to 5 instances.
  #
  # Matrix values and workflow contexts reach the scripts through environment
  # variables (CELL_*, SWEEP_ID, JOB_STATUS and the default GITHUB_* variables)
  # rather than ${{ }} text substitution, so a value is never parsed as shell.
  needs: [prep]
  if: inputs.mode == 'smoke'
  permissions:
    contents: read
    packages: read
  strategy:
    fail-fast: false
    matrix:
      method: [ppr, ego, bm25, aider]
      budget: [64000]
      test_set: [swebench_verified]
  runs-on: ubuntu-latest
  timeout-minutes: 60
  container:
    image: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
    options: >-
      --cpus 2
      --memory 6g
      --memory-swap 6g
  env:
    CELL_TAG: ${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    CELL_DIR: results/sweep/${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    CELL_METHOD: ${{ matrix.method }}
    CELL_BUDGET: ${{ matrix.budget }}
    CELL_TEST_SET: ${{ matrix.test_set }}
    MANIFESTS_DIR: benchmarks/manifests/v1
    INPUT_TAU: '0.12'
    INPUT_CBF: '0.5'
    INPUT_TIMEOUT: '120'
    INPUT_MANIFESTS_SUBDIR: v1
  steps:
    - uses: actions/checkout@v6
    - name: Cell start metadata
      env:
        SWEEP_ID: ${{ needs.prep.outputs.sweep_id }}
      run: |
        mkdir -p "${CELL_DIR}"
        cat > "${CELL_DIR}/metadata.json" <<JSON
        {
          "sweep_id": "${SWEEP_ID}",
          "mode": "smoke",
          "cell": {
            "method": "${CELL_METHOD}",
            "budget": ${CELL_BUDGET},
            "test_set": "${CELL_TEST_SET}",
            "tau": ${INPUT_TAU},
            "core_budget_fraction": ${INPUT_CBF}
          },
          "git": {
            "sha": "${GITHUB_SHA}",
            "ref": "${GITHUB_REF}",
            "actor": "${GITHUB_ACTOR}"
          },
          "started_at_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
          "runner": {
            "os": "${RUNNER_OS}",
            "arch": "${RUNNER_ARCH}",
            "name": "${RUNNER_NAME}",
            "image": "${IMAGE_REF}"
          },
          "stage": "started"
        }
        JSON
    - name: Build per-cell manifest dir
      run: |
        mkdir -p /tmp/manifest_one
        cp "${MANIFESTS_DIR}/test_${CELL_TEST_SET}.txt" /tmp/manifest_one/
    # Maps the matrix method onto the (baseline, scoring) pair passed to the
    # eval: ppr/ego run the diffctx baseline with their own scoring, while
    # bm25 and aider run their own baseline with scoring=ego.
    - name: Resolve baseline + scoring
      id: cfg
      run: |
        case "${CELL_METHOD}" in
          ppr|ego)
            echo "baseline=diffctx" >> "$GITHUB_OUTPUT"
            echo "scoring=${CELL_METHOD}" >> "$GITHUB_OUTPUT"
            ;;
          bm25)
            echo "baseline=bm25" >> "$GITHUB_OUTPUT"
            echo "scoring=ego" >> "$GITHUB_OUTPUT"
            ;;
          aider)
            echo "baseline=aider_fair" >> "$GITHUB_OUTPUT"
            echo "scoring=ego" >> "$GITHUB_OUTPUT"
            ;;
          *)
            # Without this branch an unknown method produced empty outputs
            # and the eval was started with --baseline "".
            echo "ERROR: unknown method '${CELL_METHOD}'" >&2
            exit 1
            ;;
        esac
    - name: Build winner.json
      env:
        SCORING: ${{ steps.cfg.outputs.scoring }}
      run: |
        cat > /tmp/winner.json <<JSON
        {
          "winner": {
            "tau": ${INPUT_TAU},
            "core_budget_fraction": ${INPUT_CBF},
            "budget": ${CELL_BUDGET},
            "scoring": "${SCORING}",
            "extra_env": {}
          }
        }
        JSON
    - name: Install uv (required by aider baseline)
      run: pip install uv
    - name: Run cell (smoke — 5 instances)
      timeout-minutes: 50
      shell: bash
      env:
        BASELINE: ${{ steps.cfg.outputs.baseline }}
      run: |
        set -o pipefail
        python -m benchmarks.run_final_eval \
          --baseline "${BASELINE}" \
          --winner /tmp/winner.json \
          --manifests-dir /tmp/manifest_one \
          --workers 1 \
          --limit 5 \
          --timeout-per-instance "${INPUT_TIMEOUT}" \
          --min-memory-gb 2 \
          --min-disk-gb 5 \
          --out "${CELL_DIR}" 2>&1 | tee "${CELL_DIR}/run.log"
    # Summarises the per-instance checkpoint into cell_summary.json. The
    # heredoc is unquoted, so the shell substitutes the checkpoint path into
    # the Python source before it runs.
    - name: Per-instance metric summary (always)
      if: always()
      run: |
        ckpt="${CELL_DIR}/${CELL_TEST_SET}.checkpoint.jsonl"
        if [ -f "${ckpt}" ]; then
        python - <<PY > "${CELL_DIR}/cell_summary.json"
        import json, statistics, sys
        rows = []
        with open("${ckpt}") as f:
            for line in f:
                line = line.strip()
                if line:
                    rows.append(json.loads(line))
        n = len(rows)
        if n == 0:
            print(json.dumps({"n": 0}))
        else:
            recall = [r.get("file_recall", 0) for r in rows]
            precision = [r.get("file_precision", 0) for r in rows]
            elapsed = [r.get("elapsed_seconds", 0) for r in rows]
            tokens = [r.get("used_tokens", 0) for r in rows]
            # fragment_recall / line_f1 are populated only when the dataset
            # ships gold_fragments (ContextBench, PolyBench when CST nodes
            # exist); skip nulls so the aggregator gets honest means rather
            # than zero-padded ones.
            frag_recall = [r["fragment_recall"] for r in rows if r.get("fragment_recall") is not None]
            line_f1 = [r["line_f1"] for r in rows if r.get("line_f1") is not None]
            ok = sum(1 for r in rows if (r.get("extra") or {}).get("status") == "ok")
            statuses = {}
            errors = {}
            for r in rows:
                s = (r.get("extra") or {}).get("status", "missing")
                statuses[s] = statuses.get(s, 0) + 1
                if s not in ("ok",):
                    err = str((r.get("extra") or {}).get("error", ""))
                    if err:
                        errors[err] = errors.get(err, 0) + 1
            out = {
                "n": n,
                "ok": ok,
                "ok_pct": 100.0 * ok / n if n else 0.0,
                "statuses": statuses,
                "errors": dict(sorted(errors.items(), key=lambda x: -x[1])[:10]),
                "file_recall": {
                    "mean": statistics.fmean(recall),
                    "median": statistics.median(recall),
                },
                "file_precision": {"mean": statistics.fmean(precision)},
                "fragment_recall": (
                    {"mean": statistics.fmean(frag_recall), "n_with_gold": len(frag_recall)}
                    if frag_recall else None
                ),
                "line_f1": (
                    {"mean": statistics.fmean(line_f1), "n_with_gold": len(line_f1)}
                    if line_f1 else None
                ),
                "elapsed_seconds": {"mean": statistics.fmean(elapsed), "total": sum(elapsed)},
                "used_tokens": {"mean": statistics.fmean(tokens)},
            }
            print(json.dumps(out, indent=2))
            if errors:
                print("\n=== ERROR BREAKDOWN ===", file=sys.stderr)
                for msg, cnt in sorted(errors.items(), key=lambda x: -x[1]):
                    print(f" [{cnt}x] {msg[:200]}", file=sys.stderr)
        PY
          cat "${CELL_DIR}/cell_summary.json"
        else
          printf '{"error": "no checkpoint produced", "expected_path": "%s"}\n' "${ckpt}" \
            > "${CELL_DIR}/cell_summary.json"
        fi
    # Adds the end timestamp and job status to metadata.json; skipped when jq
    # is not available in the container.
    - name: Append cell-end metadata
      if: always()
      env:
        JOB_STATUS: ${{ job.status }}
      run: |
        end_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
        if command -v jq >/dev/null 2>&1; then
          tmp=$(mktemp)
          jq --arg end "${end_ts}" --arg status "${JOB_STATUS}" \
            '. + {ended_at_utc: $end, exit_status: $status}' \
            "${CELL_DIR}/metadata.json" > "${tmp}" \
            && mv "${tmp}" "${CELL_DIR}/metadata.json"
        fi
    - name: Upload cell artifact
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: cell-${{ matrix.method }}-b${{ matrix.budget }}-${{ matrix.test_set }}
        path: ${{ env.CELL_DIR }}/
        retention-days: 30
        if-no-files-found: warn
| # ============================================================================ | |
| # 3b. Full sweep — Hetzner CCX63, 4 methods × 7 budgets × 3 test sets (minus excluded cells), baked repo cache | |
| # ============================================================================ | |
sweep-full:
  # Full mode: every (method, budget, test_set) cell of the matrix below, run
  # on the self-hosted runners of the provisioned server with the baked repo
  # cache mounted at /cache/contextbench_repos.
  #
  # Matrix values and workflow contexts reach the scripts through environment
  # variables (CELL_*, SWEEP_ID, JOB_STATUS and the default GITHUB_* variables)
  # rather than ${{ }} text substitution, so a value is never parsed as shell.
  needs: [prep, provision]
  if: inputs.mode == 'full'
  permissions:
    contents: read
    packages: read
  strategy:
    fail-fast: false
    # 4 self-hosted runners on the CCX63 host; the queue can only drain at
    # that rate anyway, so cap the matrix-level parallelism to match.
    max-parallel: 4
    matrix:
      method: [ppr, ego, bm25, aider]
      # B=0 is a recall-floor sanity bound (must report ≈ 0); B=-1 is the
      # unlimited-budget ceiling. The 5 paying budgets {8k, 16k, 32k, 64k, 128k}
      # form the budget curve referenced in STATS_PLAN.md.
      budget: [-1, 0, 8000, 16000, 32000, 64000, 128000]
      test_set: [contextbench_verified, polybench500, swebench_verified]
      exclude:
        - method: bm25
          budget: -1
        - method: aider
          budget: -1
  runs-on: [self-hosted, large]
  timeout-minutes: 350
  container:
    image: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
    options: >-
      --cpus 12
      --memory 44g
      --memory-swap 44g
      --shm-size 8g
      --pids-limit -1
      --ulimit nofile=1048576:1048576
      --ulimit nproc=65535:65535
      -v /data/bench_repos:/cache/contextbench_repos
  env:
    CELL_TAG: ${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    CELL_DIR: results/sweep/${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    CELL_METHOD: ${{ matrix.method }}
    CELL_BUDGET: ${{ matrix.budget }}
    CELL_TEST_SET: ${{ matrix.test_set }}
    MANIFESTS_DIR: benchmarks/manifests/v1
    INPUT_TAU: '0.12'
    INPUT_CBF: '0.5'
    INPUT_TIMEOUT: '600'
    INPUT_MANIFESTS_SUBDIR: v1
  steps:
    - uses: actions/checkout@v6
    - name: Cell start metadata (write FIRST, before any failure point)
      env:
        SWEEP_ID: ${{ needs.prep.outputs.sweep_id }}
      run: |
        mkdir -p "${CELL_DIR}"
        cat > "${CELL_DIR}/metadata.json" <<JSON
        {
          "sweep_id": "${SWEEP_ID}",
          "mode": "full",
          "cell": {
            "method": "${CELL_METHOD}",
            "budget": ${CELL_BUDGET},
            "test_set": "${CELL_TEST_SET}",
            "tau": ${INPUT_TAU},
            "core_budget_fraction": ${INPUT_CBF}
          },
          "git": {
            "sha": "${GITHUB_SHA}",
            "ref": "${GITHUB_REF}",
            "actor": "${GITHUB_ACTOR}"
          },
          "started_at_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
          "runner": {
            "os": "${RUNNER_OS}",
            "arch": "${RUNNER_ARCH}",
            "name": "${RUNNER_NAME}",
            "image": "${IMAGE_REF}"
          },
          "stage": "started"
        }
        JSON
    # Blocks until the bake started by cloud-init drops a .ready marker, or
    # fails the cell (printing bake.log) when it drops .bake_failed. The loop
    # itself has no deadline; only the job-level timeout bounds it.
    - name: Wait for shared repo cache
      run: |
        while true; do
          if [ -f /cache/contextbench_repos/.bake_failed ]; then
            echo "=== BAKE FAILED — full log ==="
            cat /cache/contextbench_repos/bake.log || echo "(bake.log not found)"
            echo "=== end of bake.log ==="
            exit 1
          fi
          if [ -f /cache/contextbench_repos/.ready ]; then
            echo "Repo cache ready."
            break
          fi
          echo "Waiting for repo cache extraction to complete..."
          sleep 15
        done
    # Removes the cache directory of every repo the manifest lists under
    # failed_clones; a missing manifest is only a warning.
    - name: Validate bake manifest and purge known-bad repos
      shell: bash
      run: |
        set -euo pipefail
        MANIFEST=/cache/contextbench_repos/.bake_manifest.json
        if [ ! -f "${MANIFEST}" ]; then
          echo "WARN: no .bake_manifest.json found — proceeding without preflight purge"
          exit 0
        fi
        python3 - <<'PY'
        import json, shutil
        from pathlib import Path
        base = Path("/cache/contextbench_repos")
        manifest = json.loads((base / ".bake_manifest.json").read_text())
        purged = []
        for entry in manifest.get("failed_clones", []):
            repo = entry["repo"]
            cache_dir = base / repo.replace("/", "__")
            if cache_dir.exists():
                shutil.rmtree(cache_dir, ignore_errors=True)
                purged.append(repo)
        print(f"Purged {len(purged)} bake-failed repos: {purged[:10]}{'...' if len(purged) > 10 else ''}")
        fc = len(manifest.get("failed_clones", []))
        ff = len(manifest.get("failed_fetches", []))
        print(f"Bake manifest: failed_clones={fc} failed_fetches={ff}")
        PY
    # Best-effort environment snapshot; never fails the cell.
    - name: System info dump
      run: |
        {
          echo "=== uname -a ==="; uname -a
          echo; echo "=== /proc/cpuinfo (1 core) ==="; awk '/^processor/{p++} p<2' /proc/cpuinfo
          echo; echo "=== /proc/meminfo (head) ==="; head -5 /proc/meminfo
          echo; echo "=== df -h ==="; df -h
          echo; echo "=== python ==="; python --version
          echo; echo "=== pip freeze (head 30) ==="; pip freeze 2>/dev/null | head -30 || true
          echo; echo "=== rustc ==="; rustc --version 2>/dev/null || echo "no rustc"
          echo; echo "=== cargo ==="; cargo --version 2>/dev/null || echo "no cargo"
          echo; echo "=== git ==="; git --version
        } > "${CELL_DIR}/system_info.log" 2>&1 || true
    - name: Build per-cell single-test-set manifest dir
      run: |
        mkdir -p /tmp/manifest_one
        cp "${MANIFESTS_DIR}/test_${CELL_TEST_SET}.txt" /tmp/manifest_one/
    # Maps the matrix method onto the (baseline, scoring) pair passed to the
    # eval: ppr/ego run the diffctx baseline with their own scoring, while
    # bm25 and aider run their own baseline with scoring=ego.
    - name: Resolve baseline + scoring
      id: cfg
      run: |
        case "${CELL_METHOD}" in
          ppr|ego)
            echo "baseline=diffctx" >> "$GITHUB_OUTPUT"
            echo "scoring=${CELL_METHOD}" >> "$GITHUB_OUTPUT"
            ;;
          bm25)
            echo "baseline=bm25" >> "$GITHUB_OUTPUT"
            echo "scoring=ego" >> "$GITHUB_OUTPUT"
            ;;
          aider)
            echo "baseline=aider_fair" >> "$GITHUB_OUTPUT"
            echo "scoring=ego" >> "$GITHUB_OUTPUT"
            ;;
          *)
            # Without this branch an unknown method produced empty outputs
            # and the eval was started with --baseline "".
            echo "ERROR: unknown method '${CELL_METHOD}'" >&2
            exit 1
            ;;
        esac
    - name: Build winner.json
      env:
        SCORING: ${{ steps.cfg.outputs.scoring }}
      run: |
        cat > /tmp/winner.json <<JSON
        {
          "winner": {
            "tau": ${INPUT_TAU},
            "core_budget_fraction": ${INPUT_CBF},
            "budget": ${CELL_BUDGET},
            "scoring": "${SCORING}",
            "extra_env": {}
          }
        }
        JSON
    - name: Install uv (required by aider baseline; safe no-op for others)
      run: pip install --no-cache-dir uv
    # Writes a helper that, every 10 minutes, tries to upload the checkpoint
    # as an artifact named heartbeat-<cell tag>. It reads the cell identity
    # from the job environment at run time.
    # NOTE(review): the upload needs ACTIONS_RUNTIME_URL / ACTIONS_RUNTIME_TOKEN
    # in the environment of a run step — confirm they are actually present
    # there; when they are not, the helper now logs that instead of returning
    # silently.
    - name: Write checkpoint heartbeat script
      run: |
        cat > /tmp/heartbeat.py << 'PYEOF'
        import os, sys, time, json, pathlib, urllib.request
        CKPT = pathlib.Path(os.environ["CELL_DIR"]) / (os.environ["CELL_TEST_SET"] + ".checkpoint.jsonl")
        ARTIFACT_NAME = "heartbeat-" + os.environ["CELL_TAG"]
        def try_upload():
            if not CKPT.exists():
                return
            rt_url = os.environ.get("ACTIONS_RUNTIME_URL", "")
            rt_tok = os.environ.get("ACTIONS_RUNTIME_TOKEN", "")
            run_id = os.environ.get("GITHUB_RUN_ID", "")
            if not (rt_url and rt_tok and run_id):
                print("[heartbeat] skip: ACTIONS_RUNTIME_URL / ACTIONS_RUNTIME_TOKEN / GITHUB_RUN_ID not set",
                      file=sys.stderr, flush=True)
                return
            payload = json.dumps({"type": "actions_storage", "name": ARTIFACT_NAME}).encode()
            req = urllib.request.Request(
                f"{rt_url}_apis/pipelines/workflows/{run_id}/artifacts?api-version=6.0-preview",
                data=payload,
                headers={"Authorization": f"Bearer {rt_tok}", "Content-Type": "application/json"},
                method="POST",
            )
            try:
                with urllib.request.urlopen(req, timeout=30) as r:
                    resp = json.loads(r.read())
            except Exception as e:
                print(f"[heartbeat] create: {e}", file=sys.stderr, flush=True)
                return
            container_url = resp.get("fileContainerResourceUrl", "")
            if not container_url:
                print("[heartbeat] skip: response had no fileContainerResourceUrl", file=sys.stderr, flush=True)
                return
            blob = CKPT.read_bytes()
            put_req = urllib.request.Request(
                f"{container_url}?itemPath={ARTIFACT_NAME}/checkpoint.jsonl",
                data=blob,
                headers={"Authorization": f"Bearer {rt_tok}", "Content-Type": "application/octet-stream"},
                method="PUT",
            )
            try:
                urllib.request.urlopen(put_req, timeout=120)
                print(f"[heartbeat] uploaded {len(blob)} bytes", flush=True)
            except Exception as e:
                print(f"[heartbeat] put: {e}", file=sys.stderr, flush=True)
        while True:
            time.sleep(600)
            try:
                try_upload()
            except Exception as e:
                print(f"[heartbeat] loop: {e}", file=sys.stderr, flush=True)
        PYEOF
    - name: Run cell (logged to run.log)
      timeout-minutes: 330
      shell: bash
      env:
        BASELINE: ${{ steps.cfg.outputs.baseline }}
      run: |
        set -o pipefail
        python3 /tmp/heartbeat.py >> "${CELL_DIR}/heartbeat.log" 2>&1 &
        HEARTBEAT_PID=$!
        # "shell: bash" runs the script with -e, so a failing eval exits before
        # a trailing kill line would run; the EXIT trap stops the heartbeat on
        # success and failure alike.
        trap 'kill "${HEARTBEAT_PID}" 2>/dev/null || true' EXIT
        python -m benchmarks.run_final_eval \
          --baseline "${BASELINE}" \
          --winner /tmp/winner.json \
          --manifests-dir /tmp/manifest_one \
          --workers 10 \
          --timeout-per-instance "${INPUT_TIMEOUT}" \
          --min-memory-gb 32 \
          --min-disk-gb 50 \
          --out "${CELL_DIR}" 2>&1 | tee "${CELL_DIR}/run.log"
    # Summarises the per-instance checkpoint into cell_summary.json. The
    # heredoc is unquoted, so the shell substitutes the checkpoint path into
    # the Python source before it runs.
    - name: Per-instance metric summary (always)
      if: always()
      run: |
        ckpt="${CELL_DIR}/${CELL_TEST_SET}.checkpoint.jsonl"
        if [ -f "${ckpt}" ]; then
        python - <<PY > "${CELL_DIR}/cell_summary.json"
        import json, statistics, sys
        def nearest_rank(sorted_vals, pct):
            # Nearest-rank percentile: the ceil(pct/100 * n)-th smallest value.
            # One rule for both p25 and p75; the earlier index arithmetic
            # picked p75 one rank higher than p25's convention.
            k = -(-len(sorted_vals) * pct // 100)
            return sorted_vals[max(0, k - 1)]
        rows = []
        with open("${ckpt}") as f:
            for line in f:
                line = line.strip()
                if line:
                    rows.append(json.loads(line))
        n = len(rows)
        if n == 0:
            print(json.dumps({"n": 0}))
        else:
            recall = [r.get("file_recall", 0) for r in rows]
            precision = [r.get("file_precision", 0) for r in rows]
            elapsed = [r.get("elapsed_seconds", 0) for r in rows]
            tokens = [r.get("used_tokens", 0) for r in rows]
            # fragment_recall / line_f1 are populated only when the dataset
            # ships gold_fragments (ContextBench, PolyBench when CST nodes
            # exist); skip nulls so the aggregator gets honest means rather
            # than zero-padded ones.
            frag_recall = [r["fragment_recall"] for r in rows if r.get("fragment_recall") is not None]
            line_f1 = [r["line_f1"] for r in rows if r.get("line_f1") is not None]
            ok = sum(1 for r in rows if (r.get("extra") or {}).get("status") == "ok")
            statuses = {}
            errors = {}
            for r in rows:
                s = (r.get("extra") or {}).get("status", "missing")
                statuses[s] = statuses.get(s, 0) + 1
                if s not in ("ok",):
                    err = str((r.get("extra") or {}).get("error", ""))
                    if err:
                        errors[err] = errors.get(err, 0) + 1
            recall_sorted = sorted(recall)
            out = {
                "n": n,
                "ok": ok,
                "ok_pct": 100.0 * ok / n if n else 0.0,
                "statuses": statuses,
                "errors": dict(sorted(errors.items(), key=lambda x: -x[1])[:10]),
                "file_recall": {
                    "mean": statistics.fmean(recall),
                    "median": statistics.median(recall),
                    "p25": nearest_rank(recall_sorted, 25),
                    "p75": nearest_rank(recall_sorted, 75),
                },
                "file_precision": {"mean": statistics.fmean(precision)},
                "fragment_recall": (
                    {"mean": statistics.fmean(frag_recall), "n_with_gold": len(frag_recall)}
                    if frag_recall else None
                ),
                "line_f1": (
                    {"mean": statistics.fmean(line_f1), "n_with_gold": len(line_f1)}
                    if line_f1 else None
                ),
                "elapsed_seconds": {
                    "mean": statistics.fmean(elapsed),
                    "max": max(elapsed),
                    "total": sum(elapsed),
                },
                "used_tokens": {
                    "mean": statistics.fmean(tokens),
                    "max": max(tokens),
                },
            }
            print(json.dumps(out, indent=2))
            if errors:
                print("\n=== ERROR BREAKDOWN ===", file=sys.stderr)
                for msg, cnt in sorted(errors.items(), key=lambda x: -x[1]):
                    print(f" [{cnt}x] {msg[:200]}", file=sys.stderr)
        PY
          cat "${CELL_DIR}/cell_summary.json"
        else
          printf '{"error": "no checkpoint produced", "expected_path": "%s"}\n' "${ckpt}" \
            > "${CELL_DIR}/cell_summary.json"
          ls -la "${CELL_DIR}" || true
        fi
    # Adds end timestamp, job status and root-disk usage to metadata.json;
    # the jq update is skipped when jq is not available in the container.
    - name: Append cell-end metadata
      if: always()
      env:
        JOB_STATUS: ${{ job.status }}
      run: |
        end_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
        if command -v jq >/dev/null 2>&1; then
          tmp=$(mktemp)
          jq --arg end "${end_ts}" --arg status "${JOB_STATUS}" --arg disk "$(df -h / | tail -1)" \
            '. + {ended_at_utc: $end, exit_status: $status, runner_disk_at_end: $disk}' \
            "${CELL_DIR}/metadata.json" > "${tmp}" \
            && mv "${tmp}" "${CELL_DIR}/metadata.json"
        fi
        cat "${CELL_DIR}/metadata.json"
    - name: Upload cell artifact (90-day retention)
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: cell-${{ matrix.method }}-b${{ matrix.budget }}-${{ matrix.test_set }}
        path: ${{ env.CELL_DIR }}/
        retention-days: 90
        if-no-files-found: warn
| # ============================================================================ | |
| # 4. Aggregate all cells into per-cell summary table + raw artifact archive. | |
| # ============================================================================ | |
aggregate:
  # Collects every cell-* artifact, builds the aggregated tables with
  # benchmarks.aggregate_sweep, uploads them, and commits tables plus raw
  # cells to the results branch under a directory named after the sweep id.
  # Runs even when sweep jobs failed or were skipped (if: always()).
  needs: [prep, sweep-smoke, sweep-full]
  if: always()
  runs-on: ubuntu-latest
  permissions:
    contents: write
  steps:
    - uses: actions/checkout@v6
    - uses: actions/setup-python@v6
      with:
        python-version: '3.12'
    - name: Install deps
      run: pip install numpy scipy
    - name: Download all cell artifacts
      uses: actions/download-artifact@v8
      with:
        path: all_cells
        pattern: cell-*
    - name: Aggregate per-cell summaries
      env:
        SWEEP_ID: ${{ needs.prep.outputs.sweep_id }}
      run: |
        mkdir -p aggregated
        python -m benchmarks.aggregate_sweep \
          --cells-dir all_cells \
          --sweep-id "${SWEEP_ID}" \
          --out aggregated/
    - name: Show aggregate summary
      run: |
        echo "=== aggregated/ ==="
        ls -la aggregated/
        echo
        echo "=== aggregated/SWEEP_TABLE.md ==="
        cat aggregated/SWEEP_TABLE.md || true
        echo
        echo "=== aggregated/grand_summary.json (head 80) ==="
        head -80 aggregated/grand_summary.json || true
    - name: Upload aggregated tables (1-year retention)
      uses: actions/upload-artifact@v7
      with:
        name: sweep-aggregated-${{ needs.prep.outputs.sweep_id }}
        path: aggregated/
        retention-days: 365
    - name: Commit raw artifacts to results branch
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        # Single source of truth: the workflow-level RESULTS_BRANCH.
        BR: ${{ env.RESULTS_BRANCH }}
        REPO: ${{ github.repository }}
        SWEEP_ID: ${{ needs.prep.outputs.sweep_id }}
        TS: ${{ needs.prep.outputs.ts }}
        INPUT_TAU: '0.12'
        INPUT_CBF: '0.5'
        # Smoke cells run with a 120 s per-instance timeout, full cells with
        # 600 s; report the one that applied to this run.
        INPUT_TIMEOUT: ${{ inputs.mode == 'smoke' && '120' || '600' }}
        INPUT_MANIFESTS_SUBDIR: v1
      run: |
        git clone --depth 1 --branch main \
          "https://x-access-token:${GITHUB_TOKEN}@github.com/${REPO}.git" \
          sidecar
        cd sidecar
        # The identity has to be set inside the sidecar clone: it is a separate
        # repository, so config written to the outer checkout does not apply
        # to commits made here.
        git config user.name 'github-actions[bot]'
        git config user.email 'github-actions[bot]@users.noreply.github.com'
        if git ls-remote --exit-code --heads origin "${BR}" >/dev/null; then
          git fetch origin "${BR}":"${BR}"
          git checkout "${BR}"
        else
          git checkout --orphan "${BR}"
          git rm -rf . 2>/dev/null || true
        fi
        mkdir -p "${SWEEP_ID}"
        cp -r ../aggregated "${SWEEP_ID}/aggregated"
        cp -r ../all_cells "${SWEEP_ID}/cells"
        cat > "${SWEEP_ID}/README.md" <<EOF
        # Sweep ${SWEEP_ID}
        - Triggered: ${TS}
        - Source SHA: ${GITHUB_SHA}
        - Manifests: ${INPUT_MANIFESTS_SUBDIR}
        - tau=${INPUT_TAU}, core_budget_fraction=${INPUT_CBF}
        - Per-instance timeout: ${INPUT_TIMEOUT}s
        See \`aggregated/SWEEP_TABLE.md\` and \`cells/\` for raw per-instance JSONL checkpoints.
        EOF
        git add -A
        # Only an empty index counts as "nothing to commit"; any other commit
        # failure now stops the step instead of being reported as a no-op.
        if git diff --cached --quiet; then
          echo "nothing to commit"
        else
          git commit -m "sweep ${SWEEP_ID} (source ${GITHUB_SHA})"
        fi
        pushed=false
        for attempt in 1 2 3; do
          echo "Push attempt ${attempt}/3..."
          git pull --rebase origin "${BR}" 2>/dev/null || true
          if git push -u origin "${BR}"; then
            pushed=true
            break
          fi
          sleep 10
        done
        # The loop used to fall through with exit status 0 after three failed
        # pushes, so lost results looked like success.
        if [ "${pushed}" != true ]; then
          echo "ERROR: could not push ${BR} after 3 attempts." >&2
          exit 1
        fi
| # ============================================================================ | |
| # 5. Deregister runners and destroy Hetzner server — full mode only | |
| # ============================================================================ | |
cleanup:
  # Full mode only: removes the self-hosted runners registered for this run
  # and deletes the Hetzner server. Runs regardless of the sweep outcome.
  needs: [sweep-full]
  if: always() && inputs.mode == 'full'
  runs-on: ubuntu-latest
  permissions:
    actions: write
  steps:
    - name: Deregister self-hosted runners
      env:
        GH_TOKEN: ${{ secrets.GH_PAT }}
        REPO: ${{ github.repository }}
        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
      run: |
        set -euo pipefail
        # The trailing "-" keeps attempt 1 from matching attempt 10+ runners.
        RUNNER_IDS=$(curl -sf \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          "https://api.github.com/repos/${REPO}/actions/runners?per_page=100" \
          | jq --arg pfx "${SERVER_NAME}-" \
            '[.runners[] | select(.name | startswith($pfx)) | .id]')
        echo "Runner IDs to remove: ${RUNNER_IDS}"
        echo "${RUNNER_IDS}" | jq -r '.[]' | while read -r rid; do
          echo "  Deleting runner ${rid}..."
          curl -sf -X DELETE \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${REPO}/actions/runners/${rid}" || true
        done
    # if: always() on the remaining steps: a failure while deregistering
    # runners must not leave the server running.
    - name: Download server_id artifact
      if: always()
      uses: actions/download-artifact@v8
      with:
        name: hetzner-server-id
        path: /tmp/hetzner
    - name: Delete Hetzner server
      if: always()
      env:
        HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
      run: |
        set -euo pipefail
        ID_FILE=/tmp/hetzner/server_id.txt
        if [ ! -f "${ID_FILE}" ]; then
          echo "No server_id artifact found — nothing to delete."
          exit 0
        fi
        SERVER_ID=$(cat "${ID_FILE}")
        if [ -z "${SERVER_ID}" ] || [ "${SERVER_ID}" = "null" ]; then
          echo "No server ID found — nothing to delete."
          exit 0
        fi
        echo "Deleting Hetzner server ${SERVER_ID}..."
        # Retry, then fail loudly: a swallowed error here would leave the
        # server running unnoticed. 404 means it is already gone.
        for attempt in 1 2 3; do
          HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' -X DELETE \
            "https://api.hetzner.cloud/v1/servers/${SERVER_ID}" \
            -H "Authorization: Bearer ${HCLOUD_TOKEN}") || HTTP_CODE=000
          case "${HTTP_CODE}" in
            2??)
              echo "Server ${SERVER_ID} deletion requested."
              exit 0
              ;;
            404)
              echo "Server ${SERVER_ID} no longer exists."
              exit 0
              ;;
            *)
              echo "Delete attempt ${attempt}/3 failed (HTTP ${HTTP_CODE})." >&2
              sleep 10
              ;;
          esac
        done
        echo "ERROR: could not delete Hetzner server ${SERVER_ID}; remove it manually." >&2
        exit 1