Skip to content

bench-sweep

bench-sweep #44

Workflow file for this run

name: bench-sweep
# Multi-runner parallel sweep across method × budget × test_set.
#
# Two modes (select at dispatch):
# smoke — GitHub-hosted ubuntu-latest, 4 cells ({ppr, ego, bm25, aider} × 64000 × swebench_verified),
# 5 instances per cell, no Hetzner provisioning. Quick sanity check.
# full — Hetzner CCX63 (4 × self-hosted runners), complete 4×7×3 method × budget × test_set
# matrix minus the bm25/aider unlimited-budget (B=-1) cells, baked repo cache. Production sweep.
#
# Trigger via workflow_dispatch. Results are committed to bench-results/sweep.
"on":
  # Manual dispatch only; the operator picks which sweep variant to run.
  workflow_dispatch:
    inputs:
      mode:
        type: choice
        options:
          - smoke
          - full
        default: smoke
        required: true
        description: 'smoke = GH Actions, 5 instances | full = Hetzner CCX63, complete sweep'
# All runs share one concurrency group; an in-flight sweep is never cancelled
# by a newer dispatch.
concurrency:
  cancel-in-progress: false
  group: bench-sweep
# Workflow-wide environment, visible to every step of every job.
env:
  # Container registry and the benchmark image pulled from it.
  REGISTRY: ghcr.io
  IMAGE_REF: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
  # Sweep parameters (kept as strings so YAML does not retype them).
  MANIFESTS_SUBDIR: v1
  TAU: "0.12"
  CORE_BUDGET_FRACTION: "0.5"
  TIMEOUT_PER_INSTANCE: "600"
  # Where results are committed, and the per-run tag used for server naming.
  RESULTS_BRANCH: bench-results/sweep
  SERVER_TAG: bench-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
# ============================================================================
# 1. Compute a stable sweep_id used as the directory name for raw artifacts
# ============================================================================
prep:
  # Derives the sweep identifier (UTC timestamp + short commit SHA) that the
  # downstream jobs read through this job's outputs.
  runs-on: ubuntu-latest
  permissions:
    contents: read
  outputs:
    sweep_id: ${{ steps.id.outputs.sweep_id }}
    ts: ${{ steps.id.outputs.ts }}
  steps:
    - uses: actions/checkout@v6
    - id: id
      run: |
        stamp="$(date -u +%Y%m%dT%H%M%SZ)"
        short_sha="$(git rev-parse --short HEAD)"
        # Publish both values as step outputs in a single append.
        {
          echo "sweep_id=sweep-${stamp}-${short_sha}"
          echo "ts=${stamp}"
        } >> "$GITHUB_OUTPUT"
# ============================================================================
# 2. Provision Hetzner CCX63 — full mode only
# ============================================================================
provision:
  # Full mode only: creates one Hetzner CCX63 server whose cloud-init script
  # installs Docker, starts the repo-cache bake in the background, and
  # registers four self-hosted runners; then waits until all four are online.
  needs: prep
  if: inputs.mode == 'full'
  runs-on: ubuntu-latest
  permissions:
    actions: write
  outputs:
    server_id: ${{ steps.create.outputs.server_id }}
    server_ip: ${{ steps.create.outputs.server_ip }}
  steps:
    - name: Create Hetzner CCX63 with 4 self-hosted runners
      id: create
      env:
        HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
        GH_TOKEN: ${{ secrets.GH_PAT }}
        REPO: ${{ github.repository }}
        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
        BENCH_SSH_PUBLIC_KEY: ${{ secrets.BENCH_SSH_PUBLIC_KEY }}
      run: |
        set -euo pipefail
        REG_TOKEN=$(curl -sf -X POST \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "https://api.github.com/repos/${REPO}/actions/runners/registration-token" \
          | jq -r '.token')
        RUNNER_VER=$(curl -sf \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          "https://api.github.com/repos/actions/runner/releases/latest" \
          | jq -r '.tag_name[1:]')
        REPO_URL="https://github.com/${REPO}"
        # The heredoc delimiter is unquoted on purpose: unescaped ${VAR}
        # references are expanded here on the GitHub runner, while \${VAR} and
        # \$(...) survive to be evaluated on the server at boot.
        # The image comes from the workflow-level IMAGE_REF so the bake uses
        # the same image as the sweep jobs instead of a hard-coded owner.
        # NOTE(review): GH_TOKEN (the PAT) and REG_TOKEN are expanded into the
        # script and therefore travel in the server's user_data; the docker
        # login user is still hard-coded — confirm both are intended.
        cat > /tmp/cloud-init.sh << ENDINIT
        #!/bin/bash
        set -euo pipefail
        export DEBIAN_FRONTEND=noninteractive
        apt-get update -y
        apt-get install -y docker.io curl jq
        systemctl enable docker && systemctl start docker
        mkdir -p /root/.ssh && chmod 700 /root/.ssh
        echo "${BENCH_SSH_PUBLIC_KEY}" >> /root/.ssh/authorized_keys
        chmod 600 /root/.ssh/authorized_keys
        chage -M -1 root
        useradd -m -s /bin/bash github
        usermod -aG docker github
        echo "${GH_TOKEN}" | docker login ghcr.io -u nikolay-e --password-stdin
        docker pull "${IMAGE_REF}"
        nohup bash -c '
          mkdir -p /data/bench_repos
          BAKE_LOG=/data/bench_repos/bake.log
          if docker run --rm \
              --entrypoint python3 \
              -v /data/bench_repos:/cache/contextbench_repos \
              -e BAKE_PARALLELISM=16 \
              "${IMAGE_REF}" \
              /app/scripts/bake_bench_cache.py /cache/contextbench_repos \
              >> "\${BAKE_LOG}" 2>&1; then
            touch /data/bench_repos/.ready
            echo "[bake] OK at \$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "\${BAKE_LOG}"
          else
            rc=\$?
            touch /data/bench_repos/.bake_failed
            echo "[bake] FAILED rc=\${rc} at \$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "\${BAKE_LOG}"
          fi
        ' &
        curl -sL -o /tmp/runner.tar.gz \
          "https://github.com/actions/runner/releases/download/v${RUNNER_VER}/actions-runner-linux-x64-${RUNNER_VER}.tar.gz"
        for N in 1 2 3 4; do
          mkdir -p "/home/github/runner-\${N}"
          tar xzf /tmp/runner.tar.gz -C "/home/github/runner-\${N}"
          chown -R github:github "/home/github/runner-\${N}"
          sudo -u github bash -c "
            cd /home/github/runner-\${N} &&
            ./config.sh \
              --url '${REPO_URL}' \
              --token '${REG_TOKEN}' \
              --name '${SERVER_NAME}-\${N}' \
              --labels 'self-hosted,large' \
              --unattended \
              --replace"
          nohup sudo -u github bash -c \
            "cd /home/github/runner-\${N} && ./run.sh" \
            >> "/home/github/runner-\${N}/runner.log" 2>&1 &
        done
        ENDINIT
        # user_data is the cloud-init script JSON-encoded by python3.
        RESPONSE=$(curl -sf -X POST "https://api.hetzner.cloud/v1/servers" \
          -H "Authorization: Bearer ${HCLOUD_TOKEN}" \
          -H "Content-Type: application/json" \
          -d "{
            \"name\": \"${SERVER_NAME}\",
            \"server_type\": \"ccx63\",
            \"image\": \"ubuntu-24.04\",
            \"location\": \"nbg1\",
            \"user_data\": $(python3 -c "import json,sys; print(json.dumps(open('/tmp/cloud-init.sh').read()))")
          }")
        SERVER_ID=$(echo "${RESPONSE}" | jq -r '.server.id')
        SERVER_IP=$(echo "${RESPONSE}" | jq -r '.server.public_net.ipv4.ip')
        echo "server_id=${SERVER_ID}" >> "${GITHUB_OUTPUT}"
        echo "server_ip=${SERVER_IP}" >> "${GITHUB_OUTPUT}"
        echo "${SERVER_ID}" > server_id.txt
        echo "Provisioned Hetzner CCX63 server_id=${SERVER_ID} ip=${SERVER_IP} name=${SERVER_NAME}"
        SSHCMD="ssh root@${SERVER_IP} -i <(security find-generic-password -s treemapper-bench-ssh-key -a \$USER -w)"
        echo "SSH: ${SSHCMD}"
    - name: Upload server_id for cleanup workflow
      uses: actions/upload-artifact@v7
      with:
        name: hetzner-server-id
        path: server_id.txt
        retention-days: 7
    - name: Wait for all 4 runners to come online
      env:
        GH_TOKEN: ${{ secrets.GH_PAT }}
        REPO: ${{ github.repository }}
        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
      run: |
        set -euo pipefail
        echo "Polling for 4 runners prefixed '${SERVER_NAME}'..."
        DEADLINE=$((SECONDS + 900))
        while [[ ${SECONDS} -lt ${DEADLINE} ]]; do
          # A transient API failure counts as "0 online" and is retried on the
          # next tick; without the fallback, set -e + pipefail would abort the
          # whole wait on a single failed request.
          # The prefix carries a trailing "-" so attempt 1 ("bench-<run>-1-N")
          # does not also match runners of attempt 10 and above.
          ONLINE=$(curl -sf \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${REPO}/actions/runners?per_page=100" \
            | jq --arg pfx "${SERVER_NAME}-" \
              '[.runners[] | select(.name | startswith($pfx)) | select(.status == "online")] | length') \
            || ONLINE=0
          echo " Online runners: ${ONLINE:-0}/4"
          [[ "${ONLINE:-0}" -ge 4 ]] && echo "All 4 runners ready." && exit 0
          sleep 30
        done
        echo "ERROR: runners did not come online within 15 minutes." >&2
        exit 1
# ============================================================================
# 3a. Smoke sweep — ubuntu-latest, minimal matrix, 5 instances, no bake cache
# ============================================================================
sweep-smoke:
  # Smoke mode only: runs each method once at budget 64000 on
  # swebench_verified, limited to 5 instances per cell, on a GitHub-hosted
  # runner inside the benchmark container.
  needs: [prep]
  if: inputs.mode == 'smoke'
  permissions:
    contents: read
    packages: read
  strategy:
    fail-fast: false
    matrix:
      method: [ppr, ego, bm25, aider]
      budget: [64000]
      test_set: [swebench_verified]
  runs-on: ubuntu-latest
  timeout-minutes: 60
  container:
    image: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
    options: >-
      --cpus 2
      --memory 6g
      --memory-swap 6g
  env:
    CELL_TAG: ${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    CELL_DIR: results/sweep/${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    MANIFESTS_DIR: benchmarks/manifests/v1
    INPUT_TAU: '0.12'
    INPUT_CBF: '0.5'
    INPUT_TIMEOUT: '120'
    INPUT_MANIFESTS_SUBDIR: v1
  steps:
    - uses: actions/checkout@v6
    - name: Cell start metadata
      # The git sha/ref/actor are read from the runner-provided GITHUB_*
      # variables rather than ${{ github.* }} expressions: expressions are
      # pasted into the script text before the shell parses it, so a ref
      # containing $(...) would be executed inside this unquoted heredoc.
      run: |
        mkdir -p "${CELL_DIR}"
        cat > "${CELL_DIR}/metadata.json" <<JSON
        {
          "sweep_id": "${{ needs.prep.outputs.sweep_id }}",
          "mode": "smoke",
          "cell": {
            "method": "${{ matrix.method }}",
            "budget": ${{ matrix.budget }},
            "test_set": "${{ matrix.test_set }}",
            "tau": ${INPUT_TAU},
            "core_budget_fraction": ${INPUT_CBF}
          },
          "git": {
            "sha": "${GITHUB_SHA}",
            "ref": "${GITHUB_REF}",
            "actor": "${GITHUB_ACTOR}"
          },
          "started_at_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
          "runner": {
            "os": "${RUNNER_OS}",
            "arch": "${RUNNER_ARCH}",
            "name": "${RUNNER_NAME}",
            "image": "${IMAGE_REF}"
          },
          "stage": "started"
        }
        JSON
    - name: Build per-cell manifest dir
      run: |
        mkdir -p /tmp/manifest_one
        cp "${MANIFESTS_DIR}/test_${{ matrix.test_set }}.txt" /tmp/manifest_one/
    - name: Resolve baseline + scoring
      id: cfg
      # Maps the matrix method onto the --baseline value and the scoring mode
      # written into winner.json.
      run: |
        method="${{ matrix.method }}"
        case "$method" in
          ppr|ego)
            echo "baseline=diffctx" >> "$GITHUB_OUTPUT"
            echo "scoring=$method" >> "$GITHUB_OUTPUT"
            ;;
          bm25)
            echo "baseline=bm25" >> "$GITHUB_OUTPUT"
            echo "scoring=ego" >> "$GITHUB_OUTPUT"
            ;;
          aider)
            echo "baseline=aider_fair" >> "$GITHUB_OUTPUT"
            echo "scoring=ego" >> "$GITHUB_OUTPUT"
            ;;
        esac
    - name: Build winner.json
      run: |
        cat > /tmp/winner.json <<JSON
        {
          "winner": {
            "tau": ${INPUT_TAU},
            "core_budget_fraction": ${INPUT_CBF},
            "budget": ${{ matrix.budget }},
            "scoring": "${SCORING}",
            "extra_env": {}
          }
        }
        JSON
      env:
        SCORING: ${{ steps.cfg.outputs.scoring }}
    - name: Install uv (required by aider baseline)
      run: pip install uv
    - name: Run cell (smoke — 5 instances)
      timeout-minutes: 50
      shell: bash
      run: |
        set -o pipefail
        python -m benchmarks.run_final_eval \
          --baseline "${BASELINE}" \
          --winner /tmp/winner.json \
          --manifests-dir /tmp/manifest_one \
          --workers 1 \
          --limit 5 \
          --timeout-per-instance "${INPUT_TIMEOUT}" \
          --min-memory-gb 2 \
          --min-disk-gb 5 \
          --out "${CELL_DIR}" 2>&1 | tee "${CELL_DIR}/run.log"
      env:
        BASELINE: ${{ steps.cfg.outputs.baseline }}
    - name: Per-instance metric summary (always)
      if: always()
      # The Python payload and its PY terminator stay at the block's base
      # indent so they land at column 0 in the generated script; the heredoc
      # is unquoted so ${ckpt} is filled in by the shell.
      run: |
        ckpt="${CELL_DIR}/${{ matrix.test_set }}.checkpoint.jsonl"
        if [ -f "${ckpt}" ]; then
          python - <<PY > "${CELL_DIR}/cell_summary.json"
        import json, statistics, sys
        rows = []
        with open("${ckpt}") as f:
            for line in f:
                line = line.strip()
                if line:
                    rows.append(json.loads(line))
        n = len(rows)
        if n == 0:
            print(json.dumps({"n": 0}))
        else:
            recall = [r.get("file_recall", 0) for r in rows]
            precision = [r.get("file_precision", 0) for r in rows]
            elapsed = [r.get("elapsed_seconds", 0) for r in rows]
            tokens = [r.get("used_tokens", 0) for r in rows]
            # fragment_recall / line_f1 are populated only when the dataset
            # ships gold_fragments (ContextBench, PolyBench when CST nodes
            # exist); skip nulls so the aggregator gets honest means rather
            # than zero-padded ones.
            frag_recall = [r["fragment_recall"] for r in rows if r.get("fragment_recall") is not None]
            line_f1 = [r["line_f1"] for r in rows if r.get("line_f1") is not None]
            ok = sum(1 for r in rows if (r.get("extra") or {}).get("status") == "ok")
            statuses = {}
            errors = {}
            for r in rows:
                s = (r.get("extra") or {}).get("status", "missing")
                statuses[s] = statuses.get(s, 0) + 1
                if s not in ("ok",):
                    err = str((r.get("extra") or {}).get("error", ""))
                    if err:
                        errors[err] = errors.get(err, 0) + 1
            out = {
                "n": n,
                "ok": ok,
                "ok_pct": 100.0 * ok / n if n else 0.0,
                "statuses": statuses,
                "errors": dict(sorted(errors.items(), key=lambda x: -x[1])[:10]),
                "file_recall": {
                    "mean": statistics.fmean(recall),
                    "median": statistics.median(recall),
                },
                "file_precision": {"mean": statistics.fmean(precision)},
                "fragment_recall": (
                    {"mean": statistics.fmean(frag_recall), "n_with_gold": len(frag_recall)}
                    if frag_recall else None
                ),
                "line_f1": (
                    {"mean": statistics.fmean(line_f1), "n_with_gold": len(line_f1)}
                    if line_f1 else None
                ),
                "elapsed_seconds": {"mean": statistics.fmean(elapsed), "total": sum(elapsed)},
                "used_tokens": {"mean": statistics.fmean(tokens)},
            }
            print(json.dumps(out, indent=2))
            if errors:
                print("\n=== ERROR BREAKDOWN ===", file=sys.stderr)
                for msg, cnt in sorted(errors.items(), key=lambda x: -x[1]):
                    print(f" [{cnt}x] {msg[:200]}", file=sys.stderr)
        PY
          cat "${CELL_DIR}/cell_summary.json"
        else
          printf '{"error": "no checkpoint produced", "expected_path": "%s"}\n' "${ckpt}" \
            > "${CELL_DIR}/cell_summary.json"
        fi
    - name: Append cell-end metadata
      if: always()
      # Skipped silently when jq is not available in the container.
      run: |
        end_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
        if command -v jq >/dev/null 2>&1; then
          tmp=$(mktemp)
          jq --arg end "${end_ts}" --arg status "${{ job.status }}" \
            '. + {ended_at_utc: $end, exit_status: $status}' \
            "${CELL_DIR}/metadata.json" > "${tmp}" \
            && mv "${tmp}" "${CELL_DIR}/metadata.json"
        fi
    - name: Upload cell artifact
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: cell-${{ matrix.method }}-b${{ matrix.budget }}-${{ matrix.test_set }}
        path: ${{ env.CELL_DIR }}/
        retention-days: 30
        if-no-files-found: warn
# ============================================================================
# 3b. Full sweep — Hetzner CCX63, complete 4×7×3 matrix (minus excluded cells), baked repo cache
# ============================================================================
sweep-full:
  # Full mode only: runs every method × budget × test_set cell (minus the
  # excluded ones) on the self-hosted runners created by the provision job,
  # with the baked repo cache bind-mounted into the container.
  needs: [prep, provision]
  if: inputs.mode == 'full'
  permissions:
    contents: read
    packages: read
  strategy:
    fail-fast: false
    # 4 self-hosted runners on the CCX63 host; the queue can only drain at
    # that rate anyway, so cap the matrix-level parallelism to match.
    max-parallel: 4
    matrix:
      method: [ppr, ego, bm25, aider]
      # B=0 is a recall-floor sanity bound (must report ≈ 0); B=-1 is the
      # unlimited-budget ceiling. The 5 paying budgets {8k, 16k, 32k, 64k, 128k}
      # form the budget curve referenced in STATS_PLAN.md.
      budget: [-1, 0, 8000, 16000, 32000, 64000, 128000]
      test_set: [contextbench_verified, polybench500, swebench_verified]
      exclude:
        - {method: bm25, budget: -1}
        - {method: aider, budget: -1}
  runs-on: [self-hosted, large]
  timeout-minutes: 350
  container:
    image: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
    options: >-
      --cpus 12
      --memory 44g
      --memory-swap 44g
      --shm-size 8g
      --pids-limit -1
      --ulimit nofile=1048576:1048576
      --ulimit nproc=65535:65535
      -v /data/bench_repos:/cache/contextbench_repos
  env:
    CELL_TAG: ${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    CELL_DIR: results/sweep/${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
    MANIFESTS_DIR: benchmarks/manifests/v1
    INPUT_TAU: '0.12'
    INPUT_CBF: '0.5'
    INPUT_TIMEOUT: '600'
    INPUT_MANIFESTS_SUBDIR: v1
  steps:
    - uses: actions/checkout@v6
    - name: Cell start metadata (write FIRST, before any failure point)
      # The git sha/ref/actor are read from the runner-provided GITHUB_*
      # variables rather than ${{ github.* }} expressions: expressions are
      # pasted into the script text before the shell parses it, so a ref
      # containing $(...) would be executed inside this unquoted heredoc.
      run: |
        mkdir -p "${CELL_DIR}"
        cat > "${CELL_DIR}/metadata.json" <<JSON
        {
          "sweep_id": "${{ needs.prep.outputs.sweep_id }}",
          "mode": "full",
          "cell": {
            "method": "${{ matrix.method }}",
            "budget": ${{ matrix.budget }},
            "test_set": "${{ matrix.test_set }}",
            "tau": ${INPUT_TAU},
            "core_budget_fraction": ${INPUT_CBF}
          },
          "git": {
            "sha": "${GITHUB_SHA}",
            "ref": "${GITHUB_REF}",
            "actor": "${GITHUB_ACTOR}"
          },
          "started_at_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
          "runner": {
            "os": "${RUNNER_OS}",
            "arch": "${RUNNER_ARCH}",
            "name": "${RUNNER_NAME}",
            "image": "${IMAGE_REF}"
          },
          "stage": "started"
        }
        JSON
    - name: Wait for shared repo cache
      # Blocks until the background bake on the host drops a .ready or
      # .bake_failed marker; the loop has no deadline of its own, so it is
      # bounded only by the job-level timeout.
      run: |
        while true; do
          if [ -f /cache/contextbench_repos/.bake_failed ]; then
            echo "=== BAKE FAILED — full log ==="
            cat /cache/contextbench_repos/bake.log || echo "(bake.log not found)"
            echo "=== end of bake.log ==="
            exit 1
          fi
          if [ -f /cache/contextbench_repos/.ready ]; then
            echo "Repo cache ready."
            break
          fi
          echo "Waiting for repo cache extraction to complete..."
          sleep 15
        done
    - name: Validate bake manifest and purge known-bad repos
      shell: bash
      run: |
        set -euo pipefail
        MANIFEST=/cache/contextbench_repos/.bake_manifest.json
        if [ ! -f "${MANIFEST}" ]; then
          echo "WARN: no .bake_manifest.json found — proceeding without preflight purge"
          exit 0
        fi
        python3 - <<'PY'
        import json, shutil
        from pathlib import Path
        base = Path("/cache/contextbench_repos")
        manifest = json.loads((base / ".bake_manifest.json").read_text())
        purged = []
        for entry in manifest.get("failed_clones", []):
            repo = entry["repo"]
            cache_dir = base / repo.replace("/", "__")
            if cache_dir.exists():
                shutil.rmtree(cache_dir, ignore_errors=True)
                purged.append(repo)
        print(f"Purged {len(purged)} bake-failed repos: {purged[:10]}{'...' if len(purged) > 10 else ''}")
        fc = len(manifest.get("failed_clones", []))
        ff = len(manifest.get("failed_fetches", []))
        print(f"Bake manifest: failed_clones={fc} failed_fetches={ff}")
        PY
    - name: System info dump
      run: |
        {
          echo "=== uname -a ==="; uname -a
          echo; echo "=== /proc/cpuinfo (1 core) ==="; awk '/^processor/{p++} p<2' /proc/cpuinfo
          echo; echo "=== /proc/meminfo (head) ==="; head -5 /proc/meminfo
          echo; echo "=== df -h ==="; df -h
          echo; echo "=== python ==="; python --version
          echo; echo "=== pip freeze (head 30) ==="; pip freeze 2>/dev/null | head -30 || true
          echo; echo "=== rustc ==="; rustc --version 2>/dev/null || echo "no rustc"
          echo; echo "=== cargo ==="; cargo --version 2>/dev/null || echo "no cargo"
          echo; echo "=== git ==="; git --version
        } > "${CELL_DIR}/system_info.log" 2>&1 || true
    - name: Build per-cell single-test-set manifest dir
      run: |
        mkdir -p /tmp/manifest_one
        cp "${MANIFESTS_DIR}/test_${{ matrix.test_set }}.txt" /tmp/manifest_one/
    - name: Resolve baseline + scoring
      id: cfg
      # Maps the matrix method onto the --baseline value and the scoring mode
      # written into winner.json.
      run: |
        method="${{ matrix.method }}"
        case "$method" in
          ppr|ego)
            echo "baseline=diffctx" >> "$GITHUB_OUTPUT"
            echo "scoring=$method" >> "$GITHUB_OUTPUT"
            ;;
          bm25)
            echo "baseline=bm25" >> "$GITHUB_OUTPUT"
            echo "scoring=ego" >> "$GITHUB_OUTPUT"
            ;;
          aider)
            echo "baseline=aider_fair" >> "$GITHUB_OUTPUT"
            echo "scoring=ego" >> "$GITHUB_OUTPUT"
            ;;
        esac
    - name: Build winner.json
      run: |
        cat > /tmp/winner.json <<JSON
        {
          "winner": {
            "tau": ${INPUT_TAU},
            "core_budget_fraction": ${INPUT_CBF},
            "budget": ${{ matrix.budget }},
            "scoring": "${SCORING}",
            "extra_env": {}
          }
        }
        JSON
      env:
        SCORING: ${{ steps.cfg.outputs.scoring }}
    - name: Install uv (required by aider baseline; safe no-op for others)
      run: pip install --no-cache-dir uv
    - name: Write checkpoint heartbeat script
      # The heredoc is quoted, so the shell leaves the Python untouched; only
      # the ${{ ... }} expressions are substituted by Actions beforehand.
      # The script returns early unless ACTIONS_RUNTIME_URL and
      # ACTIONS_RUNTIME_TOKEN are set in its environment.
      # NOTE(review): confirm those variables are actually exposed to `run`
      # steps; if not, the heartbeat never uploads anything.
      run: |
        cat > /tmp/heartbeat.py << 'PYEOF'
        import os, sys, time, json, pathlib, urllib.request
        CKPT = pathlib.Path("${{ env.CELL_DIR }}/${{ matrix.test_set }}.checkpoint.jsonl")
        ARTIFACT_NAME = "heartbeat-${{ env.CELL_TAG }}"
        def try_upload():
            if not CKPT.exists():
                return
            rt_url = os.environ.get("ACTIONS_RUNTIME_URL", "")
            rt_tok = os.environ.get("ACTIONS_RUNTIME_TOKEN", "")
            run_id = os.environ.get("GITHUB_RUN_ID", "")
            if not (rt_url and rt_tok and run_id):
                return
            payload = json.dumps({"type": "actions_storage", "name": ARTIFACT_NAME}).encode()
            req = urllib.request.Request(
                f"{rt_url}_apis/pipelines/workflows/{run_id}/artifacts?api-version=6.0-preview",
                data=payload,
                headers={"Authorization": f"Bearer {rt_tok}", "Content-Type": "application/json"},
                method="POST",
            )
            try:
                with urllib.request.urlopen(req, timeout=30) as r:
                    resp = json.loads(r.read())
            except Exception as e:
                print(f"[heartbeat] create: {e}", file=sys.stderr, flush=True)
                return
            container_url = resp.get("fileContainerResourceUrl", "")
            if not container_url:
                return
            blob = CKPT.read_bytes()
            put_req = urllib.request.Request(
                f"{container_url}?itemPath={ARTIFACT_NAME}/checkpoint.jsonl",
                data=blob,
                headers={"Authorization": f"Bearer {rt_tok}", "Content-Type": "application/octet-stream"},
                method="PUT",
            )
            try:
                urllib.request.urlopen(put_req, timeout=120)
                print(f"[heartbeat] uploaded {len(blob)} bytes", flush=True)
            except Exception as e:
                print(f"[heartbeat] put: {e}", file=sys.stderr, flush=True)
        while True:
            time.sleep(600)
            try:
                try_upload()
            except Exception as e:
                print(f"[heartbeat] loop: {e}", file=sys.stderr, flush=True)
        PYEOF
    - name: Run cell (logged to run.log)
      timeout-minutes: 330
      shell: bash
      run: |
        set -o pipefail
        python3 /tmp/heartbeat.py >> "${CELL_DIR}/heartbeat.log" 2>&1 &
        HEARTBEAT_PID=$!
        # Stop the heartbeat on every exit path. `shell: bash` runs with -e,
        # so a failing eval ends the script immediately and a trailing `kill`
        # would never be reached, leaving the background process running.
        trap 'kill "${HEARTBEAT_PID}" 2>/dev/null || true' EXIT
        python -m benchmarks.run_final_eval \
          --baseline "${BASELINE}" \
          --winner /tmp/winner.json \
          --manifests-dir /tmp/manifest_one \
          --workers 10 \
          --timeout-per-instance "${INPUT_TIMEOUT}" \
          --min-memory-gb 32 \
          --min-disk-gb 50 \
          --out "${CELL_DIR}" 2>&1 | tee "${CELL_DIR}/run.log"
      env:
        BASELINE: ${{ steps.cfg.outputs.baseline }}
    - name: Per-instance metric summary (always)
      if: always()
      # The Python payload and its PY terminator stay at the block's base
      # indent so they land at column 0 in the generated script; the heredoc
      # is unquoted so ${ckpt} is filled in by the shell.
      run: |
        ckpt="${CELL_DIR}/${{ matrix.test_set }}.checkpoint.jsonl"
        if [ -f "${ckpt}" ]; then
          python - <<PY > "${CELL_DIR}/cell_summary.json"
        import json, statistics, sys
        rows = []
        with open("${ckpt}") as f:
            for line in f:
                line = line.strip()
                if line:
                    rows.append(json.loads(line))
        n = len(rows)
        if n == 0:
            print(json.dumps({"n": 0}))
        else:
            recall = [r.get("file_recall", 0) for r in rows]
            precision = [r.get("file_precision", 0) for r in rows]
            elapsed = [r.get("elapsed_seconds", 0) for r in rows]
            tokens = [r.get("used_tokens", 0) for r in rows]
            # fragment_recall / line_f1 are populated only when the dataset
            # ships gold_fragments (ContextBench, PolyBench when CST nodes
            # exist); skip nulls so the aggregator gets honest means rather
            # than zero-padded ones.
            frag_recall = [r["fragment_recall"] for r in rows if r.get("fragment_recall") is not None]
            line_f1 = [r["line_f1"] for r in rows if r.get("line_f1") is not None]
            ok = sum(1 for r in rows if (r.get("extra") or {}).get("status") == "ok")
            statuses = {}
            errors = {}
            for r in rows:
                s = (r.get("extra") or {}).get("status", "missing")
                statuses[s] = statuses.get(s, 0) + 1
                if s not in ("ok",):
                    err = str((r.get("extra") or {}).get("error", ""))
                    if err:
                        errors[err] = errors.get(err, 0) + 1
            out = {
                "n": n,
                "ok": ok,
                "ok_pct": 100.0 * ok / n if n else 0.0,
                "statuses": statuses,
                "errors": dict(sorted(errors.items(), key=lambda x: -x[1])[:10]),
                "file_recall": {
                    "mean": statistics.fmean(recall),
                    "median": statistics.median(recall),
                    "p25": sorted(recall)[max(0, n*1//4-1)],
                    "p75": sorted(recall)[min(n-1, n*3//4)],
                },
                "file_precision": {"mean": statistics.fmean(precision)},
                "fragment_recall": (
                    {"mean": statistics.fmean(frag_recall), "n_with_gold": len(frag_recall)}
                    if frag_recall else None
                ),
                "line_f1": (
                    {"mean": statistics.fmean(line_f1), "n_with_gold": len(line_f1)}
                    if line_f1 else None
                ),
                "elapsed_seconds": {
                    "mean": statistics.fmean(elapsed),
                    "max": max(elapsed),
                    "total": sum(elapsed),
                },
                "used_tokens": {
                    "mean": statistics.fmean(tokens),
                    "max": max(tokens),
                },
            }
            print(json.dumps(out, indent=2))
            if errors:
                print("\n=== ERROR BREAKDOWN ===", file=sys.stderr)
                for msg, cnt in sorted(errors.items(), key=lambda x: -x[1]):
                    print(f" [{cnt}x] {msg[:200]}", file=sys.stderr)
        PY
          cat "${CELL_DIR}/cell_summary.json"
        else
          printf '{"error": "no checkpoint produced", "expected_path": "%s"}\n' "${ckpt}" \
            > "${CELL_DIR}/cell_summary.json"
          ls -la "${CELL_DIR}" || true
        fi
    - name: Append cell-end metadata
      if: always()
      # The jq merge is skipped when jq is not available in the container.
      run: |
        end_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
        if command -v jq >/dev/null 2>&1; then
          tmp=$(mktemp)
          jq --arg end "${end_ts}" --arg status "${{ job.status }}" --arg disk "$(df -h / | tail -1)" \
            '. + {ended_at_utc: $end, exit_status: $status, runner_disk_at_end: $disk}' \
            "${CELL_DIR}/metadata.json" > "${tmp}" \
            && mv "${tmp}" "${CELL_DIR}/metadata.json"
        fi
        cat "${CELL_DIR}/metadata.json"
    - name: Upload cell artifact (90-day retention)
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: cell-${{ matrix.method }}-b${{ matrix.budget }}-${{ matrix.test_set }}
        path: ${{ env.CELL_DIR }}/
        retention-days: 90
        if-no-files-found: warn
# ============================================================================
# 4. Aggregate all cells into per-cell summary table + raw artifact archive.
# ============================================================================
aggregate:
  # Runs after whichever sweep job executed (the other one is skipped):
  # downloads every cell-* artifact, builds the aggregated tables, uploads
  # them, and commits the raw cells plus tables to the results branch.
  needs: [prep, sweep-smoke, sweep-full]
  if: always()
  runs-on: ubuntu-latest
  permissions:
    contents: write
  steps:
    - uses: actions/checkout@v6
    - uses: actions/setup-python@v6
      with:
        python-version: '3.12'
    - name: Install deps
      run: pip install numpy scipy
    - name: Download all cell artifacts
      uses: actions/download-artifact@v8
      with:
        path: all_cells
        pattern: cell-*
    - name: Aggregate per-cell summaries
      run: |
        mkdir -p aggregated
        python -m benchmarks.aggregate_sweep \
          --cells-dir all_cells \
          --sweep-id "${{ needs.prep.outputs.sweep_id }}" \
          --out aggregated/
    - name: Show aggregate summary
      run: |
        echo "=== aggregated/ ==="
        ls -la aggregated/
        echo
        echo "=== aggregated/SWEEP_TABLE.md ==="
        cat aggregated/SWEEP_TABLE.md || true
        echo
        echo "=== aggregated/grand_summary.json (head 80) ==="
        head -80 aggregated/grand_summary.json || true
    - name: Upload aggregated tables (1-year retention)
      uses: actions/upload-artifact@v7
      with:
        name: sweep-aggregated-${{ needs.prep.outputs.sweep_id }}
        path: aggregated/
        retention-days: 365
    - name: Commit raw artifacts to results branch
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        # Single source of truth: the workflow-level RESULTS_BRANCH.
        BR: ${{ env.RESULTS_BRANCH }}
        REPO: ${{ github.repository }}
        SWEEP_ID: ${{ needs.prep.outputs.sweep_id }}
        TS: ${{ needs.prep.outputs.ts }}
        INPUT_TAU: '0.12'
        INPUT_CBF: '0.5'
        # Record the timeout the sweep job actually used for this mode
        # (sweep-smoke runs with 120s, sweep-full with 600s).
        INPUT_TIMEOUT: ${{ inputs.mode == 'smoke' && '120' || '600' }}
        INPUT_MANIFESTS_SUBDIR: v1
      run: |
        git clone --depth 1 --branch main \
          "https://x-access-token:${GITHUB_TOKEN}@github.com/${REPO}.git" \
          sidecar
        cd sidecar
        # The identity must be configured inside the sidecar clone: repo-local
        # config set on the outer checkout is not visible to this separate
        # repository, so the commit below would fail for lack of an author.
        git config user.name 'github-actions[bot]'
        git config user.email 'github-actions[bot]@users.noreply.github.com'
        if git ls-remote --heads origin "${BR}" | grep -q "${BR}"; then
          git fetch origin "${BR}":"${BR}"
          git checkout "${BR}"
        else
          git checkout --orphan "${BR}"
          git rm -rf . 2>/dev/null || true
        fi
        mkdir -p "${SWEEP_ID}"
        cp -r ../aggregated "${SWEEP_ID}/aggregated"
        cp -r ../all_cells "${SWEEP_ID}/cells"
        cat > "${SWEEP_ID}/README.md" <<EOF
        # Sweep ${SWEEP_ID}
        - Triggered: ${TS}
        - Source SHA: ${GITHUB_SHA}
        - Manifests: ${INPUT_MANIFESTS_SUBDIR}
        - tau=${INPUT_TAU}, core_budget_fraction=${INPUT_CBF}
        - Per-instance timeout: ${INPUT_TIMEOUT}s
        See \`aggregated/SWEEP_TABLE.md\` and \`cells/\` for raw per-instance JSONL checkpoints.
        EOF
        git add -A
        # Only an empty index counts as "nothing to commit"; a genuine commit
        # failure now fails the step instead of being reported as a no-op.
        if git diff --cached --quiet; then
          echo "nothing to commit"
        else
          git commit -m "sweep ${SWEEP_ID} (source ${GITHUB_SHA})"
        fi
        pushed=false
        for attempt in 1 2 3; do
          echo "Push attempt ${attempt}/3..."
          # Best-effort rebase onto concurrent pushes; fails harmlessly when
          # the branch does not exist on the remote yet.
          git pull --rebase origin "${BR}" 2>/dev/null || true
          if git push -u origin "${BR}"; then
            pushed=true
            break
          fi
          sleep 10
        done
        # Surface a lost result set rather than ending green without a push.
        if [ "${pushed}" != true ]; then
          echo "ERROR: could not push ${BR} after 3 attempts." >&2
          exit 1
        fi
# ============================================================================
# 5. Deregister runners and destroy Hetzner server — full mode only
# ============================================================================
cleanup:
  # Full mode only, and always after sweep-full regardless of its result:
  # removes this run's self-hosted runners from the repository and deletes
  # the Hetzner server recorded in the hetzner-server-id artifact.
  needs: [sweep-full]
  if: always() && inputs.mode == 'full'
  runs-on: ubuntu-latest
  permissions:
    actions: write
  steps:
    - name: Deregister self-hosted runners
      env:
        GH_TOKEN: ${{ secrets.GH_PAT }}
        REPO: ${{ github.repository }}
        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
      run: |
        set -euo pipefail
        # The prefix carries a trailing "-" so attempt 1 ("bench-<run>-1-N")
        # does not also select runners that belong to attempt 10 and above.
        RUNNER_IDS=$(curl -sf \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          "https://api.github.com/repos/${REPO}/actions/runners?per_page=100" \
          | jq --arg pfx "${SERVER_NAME}-" \
            '[.runners[] | select(.name | startswith($pfx)) | .id]')
        echo "Runner IDs to remove: ${RUNNER_IDS}"
        echo "${RUNNER_IDS}" | jq -r '.[]' | while read -r rid; do
          echo " Deleting runner ${rid}..."
          # Best effort per runner: one failed delete must not stop the rest.
          curl -sf -X DELETE \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${REPO}/actions/runners/${rid}" || true
        done
    # The two steps below run even when deregistration failed; otherwise a
    # GitHub API hiccup would skip them and leave the server running.
    - name: Download server_id artifact
      if: always()
      uses: actions/download-artifact@v8
      with:
        name: hetzner-server-id
        path: /tmp/hetzner
    - name: Delete Hetzner server
      if: always()
      env:
        HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
      run: |
        set -euo pipefail
        if [ ! -f /tmp/hetzner/server_id.txt ]; then
          echo "No server_id artifact — nothing to delete."
          exit 0
        fi
        SERVER_ID=$(cat /tmp/hetzner/server_id.txt)
        if [ -z "${SERVER_ID}" ] || [ "${SERVER_ID}" = "null" ]; then
          echo "No server ID found — nothing to delete."
          exit 0
        fi
        echo "Deleting Hetzner server ${SERVER_ID}..."
        # Retry, and fail loudly if the server could not be deleted: a
        # swallowed error here would leave the server running unnoticed.
        for attempt in 1 2 3; do
          code=$(curl -s -o /dev/null -w '%{http_code}' \
            -X DELETE "https://api.hetzner.cloud/v1/servers/${SERVER_ID}" \
            -H "Authorization: Bearer ${HCLOUD_TOKEN}") || code=000
          case "${code}" in
            2??)
              echo "Server ${SERVER_ID} deletion requested."
              exit 0
              ;;
            404)
              echo "Server ${SERVER_ID} is already gone."
              exit 0
              ;;
          esac
          echo " Attempt ${attempt}/3 failed (HTTP ${code})." >&2
          sleep 10
        done
        echo "ERROR: could not delete Hetzner server ${SERVER_ID}; delete it manually." >&2
        exit 1