bench-sweep #55

Workflow file for this run

.github/workflows/bench-sweep.yml at 6c28522

	name: bench-sweep

	# Multi-runner parallel sweep across method × budget × test_set (plus EGO
	# graph depth in full mode).
	#
	# Two modes (select at dispatch):
	# smoke — GitHub-hosted ubuntu-latest, 4 cells (ppr, ego, bm25, aider at
	# budget 64000 on swebench_verified), 5 instances each, no Hetzner
	# provisioning. Quick sanity check.
	# full — Hetzner CCX63 (4 × self-hosted runners), complete
	# method × budget × depth × test_set matrix (minus excluded combinations),
	# baked repo cache. Production sweep.
	#
	# Trigger via workflow_dispatch. Results are committed to bench-results/sweep.

	# Manual dispatch only. The `mode` input selects which job set runs: the
	# jobs below gate themselves on `inputs.mode == 'smoke'` / `'full'`.
	'on':
	workflow_dispatch:
	inputs:
	mode:
	description: 'smoke = GH Actions, 5 instances \| full = Hetzner CCX63, complete sweep'
	required: true
	default: smoke
	type: choice
	options: [smoke, full]

	# Every run joins the same concurrency group; cancel-in-progress is off, so
	# a newer dispatch never cancels a sweep that is already running.
	concurrency:
	group: bench-sweep
	cancel-in-progress: false

	# Workflow-level environment, exported to every step of every job.
	# IMAGE_REF is read by the "Cell start metadata" steps.
	# NOTE(review): no step in this file references REGISTRY, MANIFESTS_SUBDIR,
	# TIMEOUT_PER_INSTANCE, TAU, CORE_BUDGET_FRACTION, RESULTS_BRANCH or
	# SERVER_TAG by name — the jobs define their own INPUT_*, BR and SERVER_NAME
	# values instead. Scripts invoked by the steps could still read them from
	# the environment; confirm before removing or relying on them.
	env:
	REGISTRY: ghcr.io
	IMAGE_REF: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
	MANIFESTS_SUBDIR: v1
	TIMEOUT_PER_INSTANCE: '600'
	TAU: '0.12'
	CORE_BUDGET_FRACTION: '0.5'
	RESULTS_BRANCH: bench-results/sweep
	SERVER_TAG: bench-${{ github.run_id }}-${{ github.run_attempt }}

	jobs:
	# ============================================================================
	# 1. Compute a stable sweep_id used as the directory name for raw artifacts
	# ============================================================================
	prep:
	  runs-on: ubuntu-latest
	  permissions:
	    contents: read
	  outputs:
	    sweep_id: ${{ steps.id.outputs.sweep_id }}
	    ts: ${{ steps.id.outputs.ts }}
	  steps:
	    - uses: actions/checkout@v6
	    # Derives the sweep identifier "sweep-<UTC timestamp>-<short sha>" and
	    # publishes it, together with the bare timestamp, as step outputs.
	    - id: id
	      run: |
	        stamp="$(date -u +%Y%m%dT%H%M%SZ)"
	        short_sha="$(git rev-parse --short HEAD)"
	        {
	          echo "sweep_id=sweep-${stamp}-${short_sha}"
	          echo "ts=${stamp}"
	        } >> "$GITHUB_OUTPUT"

	# ============================================================================
	# 2. Provision Hetzner CCX63 — full mode only
	# ============================================================================
	provision:
	  needs: prep
	  if: inputs.mode == 'full'
	  runs-on: ubuntu-latest
	  permissions:
	    actions: write
	  outputs:
	    server_id: ${{ steps.create.outputs.server_id }}
	    server_ip: ${{ steps.create.outputs.server_ip }}
	  steps:
	    # Boots one Hetzner server whose cloud-init script installs Docker,
	    # starts the repo-cache bake in the background and registers four
	    # self-hosted runners named "<SERVER_NAME>-1" .. "<SERVER_NAME>-4".
	    # Outputs: server_id and server_ip (server_id is also written to
	    # server_id.txt for the artifact upload that follows).
	    - name: Create Hetzner CCX63 with 4 self-hosted runners
	      id: create
	      env:
	        HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
	        GH_TOKEN: ${{ secrets.GH_PAT }}
	        REPO: ${{ github.repository }}
	        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
	        BENCH_SSH_PUBLIC_KEY: ${{ secrets.BENCH_SSH_PUBLIC_KEY }}
	      run: |
	        set -euo pipefail

	        # Registration token the runners use to attach to this repository.
	        # `// empty` turns a missing field into an empty string instead of
	        # the literal "null" so the check below can catch it.
	        REG_TOKEN=$(curl -sf -X POST \
	          -H "Authorization: Bearer ${GH_TOKEN}" \
	          -H "Accept: application/vnd.github+json" \
	          "https://api.github.com/repos/${REPO}/actions/runners/registration-token" \
	          | jq -r '.token // empty')

	        # Latest actions/runner release tag with its first character
	        # (the "v") dropped.
	        RUNNER_VER=$(curl -sf \
	          -H "Authorization: Bearer ${GH_TOKEN}" \
	          "https://api.github.com/repos/actions/runner/releases/latest" \
	          | jq -r '.tag_name // empty | .[1:]')

	        # Fail before paying for a server that could never register.
	        if [ -z "${REG_TOKEN}" ] || [ -z "${RUNNER_VER}" ]; then
	          echo "::error::Could not obtain runner registration token or runner version." >&2
	          exit 1
	        fi

	        REPO_URL="https://github.com/${REPO}"

	        # The heredoc is intentionally unquoted: ${...} is expanded HERE,
	        # on the GitHub-hosted runner, while \${...} / \$(...) survive as
	        # literal $ and are evaluated later on the server.
	        # NOTE(review): this writes GH_TOKEN (the PAT) and REG_TOKEN in
	        # clear text into the server's user_data. Confirm that is an
	        # accepted exposure, or switch to a narrower-scoped token.
	        # The image is taken from the workflow-level IMAGE_REF so the
	        # server pulls the same image the sweep jobs run in.
	        cat > /tmp/cloud-init.sh << ENDINIT
	        #!/bin/bash
	        set -euo pipefail
	        export DEBIAN_FRONTEND=noninteractive
	        apt-get update -y
	        apt-get install -y docker.io curl jq
	        systemctl enable docker && systemctl start docker
	        mkdir -p /root/.ssh && chmod 700 /root/.ssh
	        echo "${BENCH_SSH_PUBLIC_KEY}" >> /root/.ssh/authorized_keys
	        chmod 600 /root/.ssh/authorized_keys
	        chage -M -1 root
	        useradd -m -s /bin/bash github
	        usermod -aG docker github
	        echo "${GH_TOKEN}" | docker login ghcr.io -u nikolay-e --password-stdin
	        docker pull ${IMAGE_REF}
	        nohup bash -c '
	        mkdir -p /data/bench_repos
	        BAKE_LOG=/data/bench_repos/bake.log
	        if docker run --rm \
	          --entrypoint python3 \
	          -v /data/bench_repos:/cache/contextbench_repos \
	          -e BAKE_PARALLELISM=16 \
	          ${IMAGE_REF} \
	          /app/scripts/bake_bench_cache.py /cache/contextbench_repos \
	          >> "\${BAKE_LOG}" 2>&1; then
	          touch /data/bench_repos/.ready
	          echo "[bake] OK at \$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "\${BAKE_LOG}"
	        else
	          rc=\$?
	          touch /data/bench_repos/.bake_failed
	          echo "[bake] FAILED rc=\${rc} at \$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "\${BAKE_LOG}"
	        fi
	        ' &
	        curl -fsSL -o /tmp/runner.tar.gz \
	          "https://github.com/actions/runner/releases/download/v${RUNNER_VER}/actions-runner-linux-x64-${RUNNER_VER}.tar.gz"
	        for N in 1 2 3 4; do
	          mkdir -p "/home/github/runner-\${N}"
	          tar xzf /tmp/runner.tar.gz -C "/home/github/runner-\${N}"
	          chown -R github:github "/home/github/runner-\${N}"
	          sudo -u github bash -c "
	            cd /home/github/runner-\${N} &&
	            ./config.sh \
	              --url '${REPO_URL}' \
	              --token '${REG_TOKEN}' \
	              --name '${SERVER_NAME}-\${N}' \
	              --labels 'self-hosted,large' \
	              --unattended \
	              --replace"
	          nohup sudo -u github bash -c \
	            "cd /home/github/runner-\${N} && ./run.sh" \
	            >> "/home/github/runner-\${N}/runner.log" 2>&1 &
	        done
	        ENDINIT

	        # python3 JSON-encodes the script so it can be embedded as the
	        # user_data string of the create-server request.
	        RESPONSE=$(curl -sf -X POST "https://api.hetzner.cloud/v1/servers" \
	          -H "Authorization: Bearer ${HCLOUD_TOKEN}" \
	          -H "Content-Type: application/json" \
	          -d "{
	            \"name\": \"${SERVER_NAME}\",
	            \"server_type\": \"ccx63\",
	            \"image\": \"ubuntu-24.04\",
	            \"location\": \"nbg1\",
	            \"user_data\": $(python3 -c "import json,sys; print(json.dumps(open('/tmp/cloud-init.sh').read()))")
	          }")

	        SERVER_ID=$(echo "${RESPONSE}" | jq -r '.server.id // empty')
	        SERVER_IP=$(echo "${RESPONSE}" | jq -r '.server.public_net.ipv4.ip // empty')
	        # Without an id there is nothing to wait for or to clean up later.
	        if [ -z "${SERVER_ID}" ]; then
	          echo "::error::Hetzner API response did not contain a server id." >&2
	          exit 1
	        fi
	        echo "server_id=${SERVER_ID}" >> "${GITHUB_OUTPUT}"
	        echo "server_ip=${SERVER_IP}" >> "${GITHUB_OUTPUT}"
	        echo "${SERVER_ID}" > server_id.txt
	        echo "Provisioned Hetzner CCX63 server_id=${SERVER_ID} ip=${SERVER_IP} name=${SERVER_NAME}"
	        SSHCMD="ssh root@${SERVER_IP} -i <(security find-generic-password -s treemapper-bench-ssh-key -a \$USER -w)"
	        echo "SSH: ${SSHCMD}"

	# Persists the server id so the cleanup job can find the server to delete.
	- name: Upload server_id for cleanup workflow
	  uses: actions/upload-artifact@v7
	  with:
	    name: hetzner-server-id
	    path: server_id.txt
	    retention-days: 7

	# Polls the repository's runner list every 30 s, for up to 15 minutes,
	# until four runners named "<SERVER_NAME>-<n>" report status "online".
	- name: Wait for all 4 runners to come online
	  env:
	    GH_TOKEN: ${{ secrets.GH_PAT }}
	    REPO: ${{ github.repository }}
	    SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
	  run: |
	    set -euo pipefail
	    echo "Polling for 4 runners prefixed '${SERVER_NAME}'..."
	    DEADLINE=$((SECONDS + 900))
	    while [[ ${SECONDS} -lt ${DEADLINE} ]]; do
	      # One failed poll (network blip, API 5xx, rate limit) must not
	      # abort the whole wait under `set -e`; count it as "none online"
	      # and try again. The trailing "-" in the prefix keeps attempt 1
	      # ("bench-<run>-1-") from matching attempt 10+ ("bench-<run>-10-").
	      if ! ONLINE=$(curl -sf \
	        -H "Authorization: Bearer ${GH_TOKEN}" \
	        "https://api.github.com/repos/${REPO}/actions/runners?per_page=100" \
	        | jq --arg pfx "${SERVER_NAME}-" \
	          '[.runners[] | select(.name | startswith($pfx)) | select(.status == "online")] | length'); then
	        echo " Runner list request failed; will retry." >&2
	        ONLINE=0
	      fi
	      echo " Online runners: ${ONLINE}/4"
	      if [[ "${ONLINE}" -ge 4 ]]; then
	        echo "All 4 runners ready."
	        exit 0
	      fi
	      sleep 30
	    done
	    echo "ERROR: runners did not come online within 15 minutes." >&2
	    exit 1

	# ============================================================================
	# 3a. Smoke sweep — ubuntu-latest, minimal matrix, 5 instances, no bake cache
	# ============================================================================
	# Smoke-mode sweep job: runs only when the dispatch input `mode` is
	# 'smoke', after `prep`, on a GitHub-hosted runner inside the bench image.
	sweep-smoke:
	needs: [prep]
	if: inputs.mode == 'smoke'
	permissions:
	contents: read
	packages: read
	# One cell per method at a single budget and test set (4 cells);
	# fail-fast is off so one failing cell does not cancel the others.
	strategy:
	fail-fast: false
	matrix:
	method: [ppr, ego, bm25, aider]
	budget: [64000]
	test_set: [swebench_verified]
	runs-on: ubuntu-latest
	timeout-minutes: 60
	# The job's steps execute inside this container, limited to 2 CPUs and
	# 6 GB of memory with no additional swap.
	container:
	image: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
	credentials:
	username: ${{ github.actor }}
	password: ${{ secrets.GITHUB_TOKEN }}
	options: >-
	--cpus 2
	--memory 6g
	--memory-swap 6g
	env:
	CELL_TAG: ${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
	CELL_DIR: results/sweep/${{ matrix.method }}_b${{ matrix.budget }}_${{ matrix.test_set }}
	MANIFESTS_DIR: benchmarks/manifests/v1
	INPUT_TAU: '0.12'
	INPUT_CBF: '0.5'
	INPUT_TIMEOUT: '120'
	INPUT_MANIFESTS_SUBDIR: v1
	steps:
	- uses: actions/checkout@v6

	# Creates CELL_DIR and writes metadata.json describing the cell, the git
	# revision and the runner. The heredoc is unquoted, so the shell expands
	# ${INPUT_TAU}, ${RUNNER_*}, ${IMAGE_REF} and $(date ...) while writing it.
	- name: Cell start metadata
	run: \|
	mkdir -p "${CELL_DIR}"
	cat > "${CELL_DIR}/metadata.json" <<JSON
	{
	"sweep_id": "${{ needs.prep.outputs.sweep_id }}",
	"mode": "smoke",
	"cell": {
	"method": "${{ matrix.method }}",
	"budget": ${{ matrix.budget }},
	"test_set": "${{ matrix.test_set }}",
	"tau": ${INPUT_TAU},
	"core_budget_fraction": ${INPUT_CBF}
	},
	"git": {
	"sha": "${{ github.sha }}",
	"ref": "${{ github.ref }}",
	"actor": "${{ github.actor }}"
	},
	"started_at_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
	"runner": {
	"os": "${RUNNER_OS}",
	"arch": "${RUNNER_ARCH}",
	"name": "${RUNNER_NAME}",
	"image": "${IMAGE_REF}"
	},
	"stage": "started"
	}
	JSON

	# Copies the one manifest this cell evaluates into a private directory.
	- name: Build per-cell manifest dir
	  env:
	    TEST_SET: ${{ matrix.test_set }}
	  run: |
	    manifest_dir=/tmp/manifest_one
	    mkdir -p "${manifest_dir}"
	    cp "${MANIFESTS_DIR}/test_${TEST_SET}.txt" "${manifest_dir}/"

	# Maps the matrix method onto the (baseline, scoring) pair published as
	# step outputs; a method outside the list publishes nothing.
	- name: Resolve baseline + scoring
	  id: cfg
	  env:
	    METHOD: ${{ matrix.method }}
	  run: |
	    case "${METHOD}" in
	      ppr|ego) baseline=diffctx;    scoring="${METHOD}" ;;
	      bm25)    baseline=bm25;       scoring=ego ;;
	      aider)   baseline=aider_fair; scoring=ego ;;
	      *)       exit 0 ;;
	    esac
	    {
	      echo "baseline=${baseline}"
	      echo "scoring=${scoring}"
	    } >> "$GITHUB_OUTPUT"

	# Writes the parameter file handed to the evaluator via --winner.
	- name: Build winner.json
	  env:
	    SCORING: ${{ steps.cfg.outputs.scoring }}
	    BUDGET: ${{ matrix.budget }}
	  run: |
	    cat > /tmp/winner.json <<JSON
	    {
	      "winner": {
	        "tau": ${INPUT_TAU},
	        "core_budget_fraction": ${INPUT_CBF},
	        "budget": ${BUDGET},
	        "scoring": "${SCORING}",
	        "extra_env": {}
	      }
	    }
	    JSON

	- name: Install uv (required by aider baseline)
	  run: |
	    pip install uv

	# Evaluates the first 5 instances with a single worker; stdout and stderr
	# are mirrored into run.log inside the cell directory.
	- name: Run cell (smoke — 5 instances)
	  timeout-minutes: 50
	  shell: bash
	  env:
	    BASELINE: ${{ steps.cfg.outputs.baseline }}
	  run: |
	    set -o pipefail
	    eval_args=(
	      --baseline "${BASELINE}"
	      --winner /tmp/winner.json
	      --manifests-dir /tmp/manifest_one
	      --workers 1
	      --limit 5
	      --timeout-per-instance "${INPUT_TIMEOUT}"
	      --min-memory-gb 2
	      --min-disk-gb 5
	      --out "${CELL_DIR}"
	    )
	    python -m benchmarks.run_final_eval "${eval_args[@]}" 2>&1 | tee "${CELL_DIR}/run.log"

	# Condenses the cell's checkpoint JSONL into cell_summary.json and prints
	# it. Runs even when earlier steps failed. If no checkpoint exists, a small
	# JSON error object naming the expected path is written instead.
	- name: Per-instance metric summary (always)
	if: always()
	run: \|
	ckpt="${CELL_DIR}/${{ matrix.test_set }}.checkpoint.jsonl"
	if [ -f "${ckpt}" ]; then
	# The heredoc is unquoted so the shell substitutes ${ckpt} into the Python
	# source; Python's stdout is redirected into cell_summary.json.
	python - <<PY > "${CELL_DIR}/cell_summary.json"
	import json, statistics, sys
	# Load every non-blank line of the checkpoint as one JSON row.
	# NOTE(review): json.loads raises on a malformed line (for example a
	# partially written last row), which would leave cell_summary.json without
	# a summary - confirm the writer only ever emits whole lines.
	rows = []
	with open("${ckpt}") as f:
	for line in f:
	line = line.strip()
	if line:
	rows.append(json.loads(line))
	n = len(rows)
	if n == 0:
	print(json.dumps({"n": 0}))
	else:
	# Rows lacking one of these keys contribute 0 to the mean.
	recall = [r.get("file_recall", 0) for r in rows]
	precision = [r.get("file_precision", 0) for r in rows]
	elapsed = [r.get("elapsed_seconds", 0) for r in rows]
	tokens = [r.get("used_tokens", 0) for r in rows]
	# fragment_recall / line_f1 are populated only when the dataset
	# ships gold_fragments (ContextBench, PolyBench when CST nodes
	# exist); skip nulls so the aggregator gets honest means rather
	# than zero-padded ones.
	frag_recall = [r["fragment_recall"] for r in rows if r.get("fragment_recall") is not None]
	line_f1 = [r["line_f1"] for r in rows if r.get("line_f1") is not None]
	ok = sum(1 for r in rows if (r.get("extra") or {}).get("status") == "ok")
	# Tally rows per status and, for non-"ok" rows, per error message.
	statuses = {}
	errors = {}
	for r in rows:
	s = (r.get("extra") or {}).get("status", "missing")
	statuses[s] = statuses.get(s, 0) + 1
	if s not in ("ok",):
	err = str((r.get("extra") or {}).get("error", ""))
	if err:
	errors[err] = errors.get(err, 0) + 1
	out = {
	"n": n,
	"ok": ok,
	"ok_pct": 100.0 * ok / n if n else 0.0,
	"statuses": statuses,
	# Only the ten most frequent error messages go into the summary.
	"errors": dict(sorted(errors.items(), key=lambda x: -x[1])[:10]),
	"file_recall": {
	"mean": statistics.fmean(recall),
	"median": statistics.median(recall),
	},
	"file_precision": {"mean": statistics.fmean(precision)},
	"fragment_recall": (
	{"mean": statistics.fmean(frag_recall), "n_with_gold": len(frag_recall)}
	if frag_recall else None
	),
	"line_f1": (
	{"mean": statistics.fmean(line_f1), "n_with_gold": len(line_f1)}
	if line_f1 else None
	),
	"elapsed_seconds": {"mean": statistics.fmean(elapsed), "total": sum(elapsed)},
	"used_tokens": {"mean": statistics.fmean(tokens)},
	}
	print(json.dumps(out, indent=2))
	# The full error breakdown goes to stderr (the job log), not into the file.
	if errors:
	print("\n=== ERROR BREAKDOWN ===", file=sys.stderr)
	for msg, cnt in sorted(errors.items(), key=lambda x: -x[1]):
	print(f" [{cnt}x] {msg[:200]}", file=sys.stderr)
	PY
	cat "${CELL_DIR}/cell_summary.json"
	else
	printf '{"error": "no checkpoint produced", "expected_path": "%s"}\n' "${ckpt}" \
	> "${CELL_DIR}/cell_summary.json"
	fi

	# Adds the end timestamp and the job status to metadata.json. The step is
	# a no-op when jq is not installed in the container.
	- name: Append cell-end metadata
	  if: always()
	  env:
	    JOB_STATUS: ${{ job.status }}
	  run: |
	    command -v jq >/dev/null 2>&1 || exit 0
	    finished_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
	    meta="${CELL_DIR}/metadata.json"
	    scratch="$(mktemp)"
	    # Write to a temp file first so a jq failure cannot truncate the
	    # existing metadata.
	    jq --arg end "${finished_at}" --arg status "${JOB_STATUS}" \
	      '. + {ended_at_utc: $end, exit_status: $status}' \
	      "${meta}" > "${scratch}" \
	      && mv "${scratch}" "${meta}"

	# Publishes the whole cell directory, whatever the outcome of the cell.
	- name: Upload cell artifact
	  if: always()
	  uses: actions/upload-artifact@v7
	  with:
	    name: cell-${{ matrix.method }}-b${{ matrix.budget }}-${{ matrix.test_set }}
	    path: ${{ env.CELL_DIR }}/
	    if-no-files-found: warn
	    retention-days: 30

	# ============================================================================
	# 3b. Full sweep — Hetzner CCX63, method × budget × depth × test_set matrix (162 cells after excludes), baked repo cache
	# ============================================================================
	# Full-mode sweep job: runs only when the dispatch input `mode` is 'full',
	# after `prep` and `provision`, on the self-hosted runners labelled
	# `self-hosted, large`.
	sweep-full:
	needs: [prep, provision]
	if: inputs.mode == 'full'
	permissions:
	contents: read
	packages: read
	strategy:
	fail-fast: false
	# 4 self-hosted runners on the CCX63 host; the queue can only drain at
	# that rate anyway, so cap the matrix-level parallelism to match.
	max-parallel: 4
	# Effective size after the excludes below: ego 7 budgets x 5 depths x 3
	# test sets = 105, ppr 7 x 1 x 3 = 21, bm25 and aider 6 x 1 x 3 = 18 each,
	# 162 cells in total.
	matrix:
	method: [ppr, ego, bm25, aider]
	# B=0 is a recall-floor sanity bound (must report ≈ 0); B=-1 is the
	# unlimited-budget ceiling. The 5 paying budgets {8k, 16k, 32k, 64k, 128k}
	# form the budget curve referenced in STATS_PLAN.md.
	budget: [-1, 0, 8000, 16000, 32000, 64000, 128000]
	# `depth` is the EGO graph traversal radius; only meaningful for
	# method=ego. PPR uses alpha (not depth), BM25 has no graph,
	# Aider is its own subprocess. Sentinel -1 means "depth does not
	# apply"; non-EGO methods run only with depth=-1.
	depth: [-1, 0, 1, 2, 3, 4]
	test_set: [contextbench_verified, polybench500, swebench_verified]
	exclude:
	# bm25 and aider are not run with the unlimited budget.
	- {method: bm25, budget: -1}
	- {method: aider, budget: -1}
	# Non-EGO methods: only the depth=-1 cell.
	- {method: ppr, depth: 0}
	- {method: ppr, depth: 1}
	- {method: ppr, depth: 2}
	- {method: ppr, depth: 3}
	- {method: ppr, depth: 4}
	- {method: bm25, depth: 0}
	- {method: bm25, depth: 1}
	- {method: bm25, depth: 2}
	- {method: bm25, depth: 3}
	- {method: bm25, depth: 4}
	- {method: aider, depth: 0}
	- {method: aider, depth: 1}
	- {method: aider, depth: 2}
	- {method: aider, depth: 3}
	- {method: aider, depth: 4}
	# EGO always runs with an explicit depth, never the sentinel.
	- {method: ego, depth: -1}
	runs-on: [self-hosted, large]
	timeout-minutes: 350
	# The job's steps execute inside this container; the host directory
	# /data/bench_repos is mounted at /cache/contextbench_repos.
	container:
	image: ghcr.io/${{ github.repository_owner }}/treemapper-bench:latest
	credentials:
	username: ${{ github.actor }}
	password: ${{ secrets.GITHUB_TOKEN }}
	options: >-
	--cpus 12
	--memory 44g
	--memory-swap 44g
	--shm-size 8g
	--pids-limit -1
	--ulimit nofile=1048576:1048576
	--ulimit nproc=65535:65535
	-v /data/bench_repos:/cache/contextbench_repos
	env:
	CELL_TAG: ${{ matrix.method }}_b${{ matrix.budget }}_L${{ matrix.depth }}_${{ matrix.test_set }}
	CELL_DIR: results/sweep/${{ matrix.method }}_b${{ matrix.budget }}_L${{ matrix.depth }}_${{ matrix.test_set }}
	MANIFESTS_DIR: benchmarks/manifests/v1
	INPUT_TAU: '0.12'
	INPUT_CBF: '0.5'
	INPUT_TIMEOUT: '600'
	INPUT_MANIFESTS_SUBDIR: v1
	steps:
	- uses: actions/checkout@v6

	# Creates CELL_DIR and writes metadata.json describing the cell, the git
	# revision and the runner. The heredoc is unquoted, so the shell expands
	# ${INPUT_TAU}, ${RUNNER_*}, ${IMAGE_REF} and $(date ...) while writing it.
	- name: Cell start metadata (write FIRST, before any failure point)
	run: \|
	mkdir -p "${CELL_DIR}"
	cat > "${CELL_DIR}/metadata.json" <<JSON
	{
	"sweep_id": "${{ needs.prep.outputs.sweep_id }}",
	"mode": "full",
	"cell": {
	"method": "${{ matrix.method }}",
	"budget": ${{ matrix.budget }},
	"depth": ${{ matrix.depth }},
	"test_set": "${{ matrix.test_set }}",
	"tau": ${INPUT_TAU},
	"core_budget_fraction": ${INPUT_CBF}
	},
	"git": {
	"sha": "${{ github.sha }}",
	"ref": "${{ github.ref }}",
	"actor": "${{ github.actor }}"
	},
	"started_at_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
	"runner": {
	"os": "${RUNNER_OS}",
	"arch": "${RUNNER_ARCH}",
	"name": "${RUNNER_NAME}",
	"image": "${IMAGE_REF}"
	},
	"stage": "started"
	}
	JSON

	# Blocks until the background bake on the host has finished. The
	# `.bake_failed` marker is checked before `.ready` on every pass, so a
	# failed bake always fails the cell; otherwise poll every 15 seconds.
	- name: Wait for shared repo cache
	  run: |
	    cache=/cache/contextbench_repos
	    while [ ! -f "${cache}/.bake_failed" ]; do
	      if [ -f "${cache}/.ready" ]; then
	        echo "Repo cache ready."
	        exit 0
	      fi
	      echo "Waiting for repo cache extraction to complete..."
	      sleep 15
	    done
	    echo "=== BAKE FAILED — full log ==="
	    cat /cache/contextbench_repos/bake.log || echo "(bake.log not found)"
	    echo "=== end of bake.log ==="
	    exit 1

	# Pre-flight disk hygiene on the cache volume shared by the four runners:
	# remove this runner's stale worktrees, prune dangling worktree
	# registrations, then require enough free space to start the cell.
	- name: Reclaim disk — purge own runner's worktrees, prune bare caches
	  shell: bash
	  run: |
	    set -euo pipefail
	    base=/cache/contextbench_repos

	    # Whole gigabytes currently available on the cache filesystem.
	    free_gb() {
	      local kb
	      kb=$(df --output=avail "${base}" | tail -1 | tr -d ' ')
	      echo $((kb / 1024 / 1024))
	    }

	    # Worktree dirs are named `worktrees/w<runner>_<pid>/` from
	    # RUNNER_NAME with `-` and space mapped to `_`. The directory is
	    # shared by all four runners, so only dirs carrying OUR slug may be
	    # deleted; a blanket removal would corrupt concurrent cells.
	    runner_slug="$(printf '%s' "${RUNNER_NAME:-unknown}" | tr ' -' '__')"

	    echo "--- Disk before cleanup ---"
	    df -h "${base}" || true
	    du -sh "${base}/worktrees" 2>/dev/null || echo " (no worktrees dir)"
	    echo " runner_slug=${runner_slug}"

	    # 1. Drop this runner's leftover worktree dirs from earlier cells.
	    if [ -d "${base}/worktrees" ]; then
	      find "${base}/worktrees" -mindepth 1 -maxdepth 1 -type d \
	        -name "w${runner_slug}_*" -prune -exec rm -rf {} + 2>/dev/null || true
	    fi

	    # 2. Prune worktree registrations whose directory is gone. Safe with
	    #    live registrations from other runners: prune only drops entries
	    #    whose target directory is missing.
	    for repo_cache in "${base}"/*/; do
	      if [ -d "${repo_cache}/.git" ] || [ -f "${repo_cache}/HEAD" ]; then
	        git -C "${repo_cache}" worktree prune --expire=now 2>/dev/null || true
	      fi
	    done

	    # 3. Headroom check. Below 150 GB: warn, wait 60 s for other runners
	    #    to release space, then abort the cell if still below 100 GB.
	    avail_gb=$(free_gb)
	    echo "--- Disk after cleanup: ${avail_gb} GB available ---"
	    df -h "${base}"
	    if [ "${avail_gb}" -ge 150 ]; then
	      exit 0
	    fi
	    echo "::warning::Less than 150 GB free on /cache/contextbench_repos after cleanup. Other runners may be holding live worktrees; retrying in 60s in case they are about to release." >&2
	    sleep 60
	    avail_gb=$(free_gb)
	    echo "--- Disk after retry wait: ${avail_gb} GB ---"
	    if [ "${avail_gb}" -lt 100 ]; then
	      echo "::error::Disk still tight (${avail_gb} GB); aborting cell." >&2
	      exit 1
	    fi

	# Reads the bake manifest (when present) and deletes the cache directory
	# of every repo listed under `failed_clones`, then prints failure counts.
	- name: Validate bake manifest and purge known-bad repos
	  shell: bash
	  run: |
	    set -euo pipefail
	    MANIFEST=/cache/contextbench_repos/.bake_manifest.json
	    if [ ! -f "${MANIFEST}" ]; then
	      echo "WARN: no .bake_manifest.json found — proceeding without preflight purge"
	      exit 0
	    fi
	    python3 - <<'PY'
	    import json, shutil
	    from pathlib import Path

	    cache_root = Path("/cache/contextbench_repos")
	    report = json.loads((cache_root / ".bake_manifest.json").read_text())

	    removed = []
	    for item in report.get("failed_clones", []):
	        name = item["repo"]
	        # Cache directories are named after the repo with "/" -> "__".
	        target = cache_root / name.replace("/", "__")
	        if not target.exists():
	            continue
	        shutil.rmtree(target, ignore_errors=True)
	        removed.append(name)

	    more = "..." if len(removed) > 10 else ""
	    print(f"Purged {len(removed)} bake-failed repos: {removed[:10]}{more}")
	    fc = len(report.get("failed_clones", []))
	    ff = len(report.get("failed_fetches", []))
	    print(f"Bake manifest: failed_clones={fc} failed_fetches={ff}")
	    PY

	# Best-effort snapshot of the execution environment; the trailing
	# `|| true` means a failing probe never fails the step.
	- name: System info dump
	  run: |
	    section() { echo; echo "=== $1 ==="; }
	    {
	      echo "=== uname -a ==="
	      uname -a
	      section "/proc/cpuinfo (1 core)"
	      awk '/^processor/{p++} p<2' /proc/cpuinfo
	      section "/proc/meminfo (head)"
	      head -5 /proc/meminfo
	      section "df -h"
	      df -h
	      section "python"
	      python --version
	      section "pip freeze (head 30)"
	      pip freeze 2>/dev/null | head -30 || true
	      section "rustc"
	      rustc --version 2>/dev/null || echo "no rustc"
	      section "cargo"
	      cargo --version 2>/dev/null || echo "no cargo"
	      section "git"
	      git --version
	    } > "${CELL_DIR}/system_info.log" 2>&1 || true

	# Copies the one manifest this cell evaluates into a private directory.
	- name: Build per-cell single-test-set manifest dir
	  env:
	    TEST_SET: ${{ matrix.test_set }}
	  run: |
	    manifest_dir=/tmp/manifest_one
	    mkdir -p "${manifest_dir}"
	    cp "${MANIFESTS_DIR}/test_${TEST_SET}.txt" "${manifest_dir}/"

	# Maps the matrix method onto the (baseline, scoring) pair published as
	# step outputs; a method outside the list publishes nothing.
	- name: Resolve baseline + scoring
	  id: cfg
	  env:
	    METHOD: ${{ matrix.method }}
	  run: |
	    case "${METHOD}" in
	      ppr|ego) baseline=diffctx;    scoring="${METHOD}" ;;
	      bm25)    baseline=bm25;       scoring=ego ;;
	      aider)   baseline=aider_fair; scoring=ego ;;
	      *)       exit 0 ;;
	    esac
	    {
	      echo "baseline=${baseline}"
	      echo "scoring=${scoring}"
	    } >> "$GITHUB_OUTPUT"

	# Writes the parameter file handed to the evaluator via --winner.
	- name: Build winner.json
	  env:
	    SCORING: ${{ steps.cfg.outputs.scoring }}
	    BUDGET: ${{ matrix.budget }}
	  run: |
	    cat > /tmp/winner.json <<JSON
	    {
	      "winner": {
	        "tau": ${INPUT_TAU},
	        "core_budget_fraction": ${INPUT_CBF},
	        "budget": ${BUDGET},
	        "scoring": "${SCORING}",
	        "extra_env": {}
	      }
	    }
	    JSON

	- name: Install uv (required by aider baseline; safe no-op for others)
	  run: |
	    pip install --no-cache-dir uv

	# Writes /tmp/heartbeat.py, a background helper that tries every 10
	# minutes to upload the cell's checkpoint as an artifact named
	# "heartbeat-<CELL_TAG>". The ${{ ... }} expressions are substituted by
	# Actions before the shell runs; the quoted heredoc delimiter stops the
	# shell from expanding anything else in the Python source.
	- name: Write checkpoint heartbeat script
	  run: |
	    cat > /tmp/heartbeat.py << 'PYEOF'
	    import os, sys, time, json, pathlib, urllib.request

	    CKPT = pathlib.Path("${{ env.CELL_DIR }}/${{ matrix.test_set }}.checkpoint.jsonl")
	    ARTIFACT_NAME = "heartbeat-${{ env.CELL_TAG }}"

	    # Credentials for the artifact service, taken from the environment.
	    RT_URL = os.environ.get("ACTIONS_RUNTIME_URL", "")
	    RT_TOK = os.environ.get("ACTIONS_RUNTIME_TOKEN", "")
	    RUN_ID = os.environ.get("GITHUB_RUN_ID", "")


	    def try_upload():
	        """Create the artifact container and PUT the current checkpoint.

	        Returns silently when no checkpoint exists yet or the service
	        gives no container URL; request errors are logged to stderr.
	        """
	        if not CKPT.exists():
	            return
	        payload = json.dumps({"type": "actions_storage", "name": ARTIFACT_NAME}).encode()
	        req = urllib.request.Request(
	            f"{RT_URL}_apis/pipelines/workflows/{RUN_ID}/artifacts?api-version=6.0-preview",
	            data=payload,
	            headers={"Authorization": f"Bearer {RT_TOK}", "Content-Type": "application/json"},
	            method="POST",
	        )
	        try:
	            with urllib.request.urlopen(req, timeout=30) as r:
	                resp = json.loads(r.read())
	        except Exception as e:
	            print(f"[heartbeat] create: {e}", file=sys.stderr, flush=True)
	            return
	        container_url = resp.get("fileContainerResourceUrl", "")
	        if not container_url:
	            return
	        blob = CKPT.read_bytes()
	        put_req = urllib.request.Request(
	            f"{container_url}?itemPath={ARTIFACT_NAME}/checkpoint.jsonl",
	            data=blob,
	            headers={"Authorization": f"Bearer {RT_TOK}", "Content-Type": "application/octet-stream"},
	            method="PUT",
	        )
	        try:
	            urllib.request.urlopen(put_req, timeout=120)
	            print(f"[heartbeat] uploaded {len(blob)} bytes", flush=True)
	        except Exception as e:
	            print(f"[heartbeat] put: {e}", file=sys.stderr, flush=True)


	    # Without credentials no upload can ever succeed. Say so once in
	    # heartbeat.log and stop, instead of sleeping forever in silence.
	    # NOTE(review): ACTIONS_RUNTIME_URL / ACTIONS_RUNTIME_TOKEN are
	    # normally not exported to `run:` steps - confirm they are set here,
	    # otherwise this helper never uploads anything.
	    if not (RT_URL and RT_TOK and RUN_ID):
	        print(
	            "[heartbeat] ACTIONS_RUNTIME_URL / ACTIONS_RUNTIME_TOKEN / "
	            "GITHUB_RUN_ID not set; heartbeat uploads disabled",
	            file=sys.stderr,
	            flush=True,
	        )
	        sys.exit(0)

	    while True:
	        time.sleep(600)
	        try:
	            try_upload()
	        except Exception as e:
	            print(f"[heartbeat] loop: {e}", file=sys.stderr, flush=True)
	    PYEOF

	# Runs the evaluator for this cell with 10 workers, mirroring its output
	# into run.log, while the heartbeat helper runs in the background.
	- name: Run cell (logged to run.log)
	  timeout-minutes: 330
	  shell: bash
	  env:
	    BASELINE: ${{ steps.cfg.outputs.baseline }}
	    MATRIX_DEPTH: ${{ matrix.depth }}
	  run: |
	    set -o pipefail
	    python3 /tmp/heartbeat.py >> "${CELL_DIR}/heartbeat.log" 2>&1 &
	    HEARTBEAT_PID=$!
	    # Stop the heartbeat on EVERY exit path. `shell: bash` runs with -e,
	    # so a failing evaluator ends the script at the pipeline below and a
	    # kill placed after it would never run. The trap does not call
	    # `exit`, so the evaluator's exit status is still what the step
	    # reports.
	    trap 'kill "${HEARTBEAT_PID}" 2>/dev/null || true' EXIT
	    # Apply EGO graph depth as an env override only when matrix.depth >= 0
	    # (the sentinel -1 means "depth doesn't apply to this method"; in that
	    # case we leave DIFFCTX_OP_GRAPH_DEPTH unset and the default kicks in).
	    if [ "${MATRIX_DEPTH}" != "-1" ]; then
	      export DIFFCTX_OP_GRAPH_DEPTH="${MATRIX_DEPTH}"
	    fi
	    python -m benchmarks.run_final_eval \
	      --baseline "${BASELINE}" \
	      --winner /tmp/winner.json \
	      --manifests-dir /tmp/manifest_one \
	      --workers 10 \
	      --timeout-per-instance "${INPUT_TIMEOUT}" \
	      --min-memory-gb 32 \
	      --min-disk-gb 50 \
	      --out "${CELL_DIR}" 2>&1 | tee "${CELL_DIR}/run.log"

	# Condenses the cell's checkpoint JSONL into cell_summary.json and prints
	# it. Runs even when earlier steps failed. If no checkpoint exists, a small
	# JSON error object naming the expected path is written and the cell
	# directory is listed instead.
	- name: Per-instance metric summary (always)
	if: always()
	run: \|
	ckpt="${CELL_DIR}/${{ matrix.test_set }}.checkpoint.jsonl"
	if [ -f "${ckpt}" ]; then
	# The heredoc is unquoted so the shell substitutes ${ckpt} into the Python
	# source; Python's stdout is redirected into cell_summary.json.
	python - <<PY > "${CELL_DIR}/cell_summary.json"
	import json, statistics, sys
	# Load every non-blank line of the checkpoint as one JSON row.
	# NOTE(review): json.loads raises on a malformed line (for example a
	# partially written last row), which would leave cell_summary.json without
	# a summary - confirm the writer only ever emits whole lines.
	rows = []
	with open("${ckpt}") as f:
	for line in f:
	line = line.strip()
	if line:
	rows.append(json.loads(line))
	n = len(rows)
	if n == 0:
	print(json.dumps({"n": 0}))
	else:
	# Rows lacking one of these keys contribute 0 to the mean.
	recall = [r.get("file_recall", 0) for r in rows]
	precision = [r.get("file_precision", 0) for r in rows]
	elapsed = [r.get("elapsed_seconds", 0) for r in rows]
	tokens = [r.get("used_tokens", 0) for r in rows]
	# fragment_recall / line_f1 are populated only when the dataset
	# ships gold_fragments (ContextBench, PolyBench when CST nodes
	# exist); skip nulls so the aggregator gets honest means rather
	# than zero-padded ones.
	frag_recall = [r["fragment_recall"] for r in rows if r.get("fragment_recall") is not None]
	line_f1 = [r["line_f1"] for r in rows if r.get("line_f1") is not None]
	ok = sum(1 for r in rows if (r.get("extra") or {}).get("status") == "ok")
	# Tally rows per status and, for non-"ok" rows, per error message.
	statuses = {}
	errors = {}
	for r in rows:
	s = (r.get("extra") or {}).get("status", "missing")
	statuses[s] = statuses.get(s, 0) + 1
	if s not in ("ok",):
	err = str((r.get("extra") or {}).get("error", ""))
	if err:
	errors[err] = errors.get(err, 0) + 1
	out = {
	"n": n,
	"ok": ok,
	"ok_pct": 100.0 * ok / n if n else 0.0,
	"statuses": statuses,
	# Only the ten most frequent error messages go into the summary.
	"errors": dict(sorted(errors.items(), key=lambda x: -x[1])[:10]),
	"file_recall": {
	"mean": statistics.fmean(recall),
	"median": statistics.median(recall),
	# NOTE(review): p25 reads sorted index n//4 - 1 while p75 reads index
	# 3n//4; for n == 4 these are the minimum and the maximum. Confirm the
	# asymmetry is intended.
	"p25": sorted(recall)[max(0, n*1//4-1)],
	"p75": sorted(recall)[min(n-1, n*3//4)],
	},
	"file_precision": {"mean": statistics.fmean(precision)},
	"fragment_recall": (
	{"mean": statistics.fmean(frag_recall), "n_with_gold": len(frag_recall)}
	if frag_recall else None
	),
	"line_f1": (
	{"mean": statistics.fmean(line_f1), "n_with_gold": len(line_f1)}
	if line_f1 else None
	),
	"elapsed_seconds": {
	"mean": statistics.fmean(elapsed),
	"max": max(elapsed),
	"total": sum(elapsed),
	},
	"used_tokens": {
	"mean": statistics.fmean(tokens),
	"max": max(tokens),
	},
	}
	print(json.dumps(out, indent=2))
	# The full error breakdown goes to stderr (the job log), not into the file.
	if errors:
	print("\n=== ERROR BREAKDOWN ===", file=sys.stderr)
	for msg, cnt in sorted(errors.items(), key=lambda x: -x[1]):
	print(f" [{cnt}x] {msg[:200]}", file=sys.stderr)
	PY
	cat "${CELL_DIR}/cell_summary.json"
	else
	printf '{"error": "no checkpoint produced", "expected_path": "%s"}\n' "${ckpt}" \
	> "${CELL_DIR}/cell_summary.json"
	ls -la "${CELL_DIR}" \|\| true
	fi

	# Adds the end timestamp, the job status and the root filesystem's
	# `df -h` line to metadata.json (only when jq is installed), then prints
	# the file.
	- name: Append cell-end metadata
	  if: always()
	  env:
	    JOB_STATUS: ${{ job.status }}
	  run: |
	    meta="${CELL_DIR}/metadata.json"
	    finished_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
	    if command -v jq >/dev/null 2>&1; then
	      scratch="$(mktemp)"
	      # Write to a temp file first so a jq failure cannot truncate the
	      # existing metadata.
	      jq --arg end "${finished_at}" --arg status "${JOB_STATUS}" --arg disk "$(df -h / | tail -1)" \
	        '. + {ended_at_utc: $end, exit_status: $status, runner_disk_at_end: $disk}' \
	        "${meta}" > "${scratch}" \
	        && mv "${scratch}" "${meta}"
	    fi
	    cat "${meta}"

	# Publishes the whole cell directory, whatever the outcome of the cell.
	- name: Upload cell artifact (90-day retention)
	  if: always()
	  uses: actions/upload-artifact@v7
	  with:
	    name: cell-${{ matrix.method }}-b${{ matrix.budget }}-L${{ matrix.depth }}-${{ matrix.test_set }}
	    path: ${{ env.CELL_DIR }}/
	    if-no-files-found: warn
	    retention-days: 90

	# End-of-cell cleanup, so a crash in the NEXT cell's preflight cannot
	# leave this runner out of disk. Only worktrees carrying this runner's
	# slug are removed: the other three runners share the disk and may have
	# live worktrees.
	- name: Reclaim disk — drop this cell's per-worker worktrees
	  if: always()
	  shell: bash
	  run: |
	    set -e
	    base=/cache/contextbench_repos
	    worktrees="${base}/worktrees"
	    runner_slug="$(printf '%s' "${RUNNER_NAME:-unknown}" | tr ' -' '__')"

	    if [ -d "${worktrees}" ]; then
	      find "${worktrees}" -mindepth 1 -maxdepth 1 -type d \
	        -name "w${runner_slug}_*" -prune -exec rm -rf {} + 2>/dev/null || true
	    fi

	    # Drop worktree registrations whose directory no longer exists.
	    for repo_cache in "${base}"/*/; do
	      if [ -d "${repo_cache}/.git" ] || [ -f "${repo_cache}/HEAD" ]; then
	        git -C "${repo_cache}" worktree prune --expire=now 2>/dev/null || true
	      fi
	    done

	    df -h "${base}" || true

	# ============================================================================
	# 4. Aggregate all cells into per-cell summary table + raw artifact archive.
	# ============================================================================
	# Collects every `cell-*` artifact, builds the aggregate tables and
	# commits both to the results branch. `if: always()` lets it run after
	# whichever sweep job was not skipped, even if some cells failed.
	aggregate:
	  needs: [prep, sweep-smoke, sweep-full]
	  if: always()
	  runs-on: ubuntu-latest
	  permissions:
	    contents: write
	  steps:
	    - uses: actions/checkout@v6

	    - uses: actions/setup-python@v6
	      with:
	        python-version: '3.12'

	    - name: Install deps
	      run: pip install numpy scipy

	    - name: Download all cell artifacts
	      uses: actions/download-artifact@v8
	      with:
	        path: all_cells
	        pattern: cell-*

	    - name: Aggregate per-cell summaries
	      run: |
	        mkdir -p aggregated
	        python -m benchmarks.aggregate_sweep \
	          --cells-dir all_cells \
	          --sweep-id "${{ needs.prep.outputs.sweep_id }}" \
	          --out aggregated/

	    - name: Show aggregate summary
	      run: |
	        echo "=== aggregated/ ==="
	        ls -la aggregated/
	        echo
	        echo "=== aggregated/SWEEP_TABLE.md ==="
	        cat aggregated/SWEEP_TABLE.md || true
	        echo
	        echo "=== aggregated/grand_summary.json (head 80) ==="
	        head -80 aggregated/grand_summary.json || true

	    - name: Upload aggregated tables (1-year retention)
	      uses: actions/upload-artifact@v7
	      with:
	        name: sweep-aggregated-${{ needs.prep.outputs.sweep_id }}
	        path: aggregated/
	        retention-days: 365

	    # Copies aggregated/ and all_cells/ into "<SWEEP_ID>/" on the results
	    # branch (created as an orphan branch on first use) and pushes it.
	    - name: Commit raw artifacts to results branch
	      env:
	        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        BR: bench-results/sweep
	        REPO: ${{ github.repository }}
	        SWEEP_ID: ${{ needs.prep.outputs.sweep_id }}
	        TS: ${{ needs.prep.outputs.ts }}
	        INPUT_TAU: '0.12'
	        INPUT_CBF: '0.5'
	        INPUT_TIMEOUT: '600'
	        INPUT_MANIFESTS_SUBDIR: v1
	      run: |
	        set -euo pipefail
	        git clone --depth 1 --branch main \
	          "https://x-access-token:${GITHUB_TOKEN}@github.com/${REPO}.git" \
	          sidecar
	        cd sidecar
	        # `git config` without --global is per-repository, so the identity
	        # has to be set inside the sidecar clone where the commit is made.
	        # Set in the outer checkout it would not apply here.
	        git config user.name 'github-actions[bot]'
	        git config user.email 'github-actions[bot]@users.noreply.github.com'
	        if git ls-remote --exit-code --heads origin "${BR}" >/dev/null; then
	          git fetch origin "${BR}":"${BR}"
	          git checkout "${BR}"
	        else
	          # First sweep ever: start the results branch with no history
	          # and an empty tree.
	          git checkout --orphan "${BR}"
	          git rm -rf . 2>/dev/null || true
	        fi
	        mkdir -p "${SWEEP_ID}"
	        cp -r ../aggregated "${SWEEP_ID}/aggregated"
	        cp -r ../all_cells "${SWEEP_ID}/cells"
	        cat > "${SWEEP_ID}/README.md" <<EOF
	        # Sweep ${SWEEP_ID}
	        - Triggered: ${TS}
	        - Source SHA: ${GITHUB_SHA}
	        - Manifests: ${INPUT_MANIFESTS_SUBDIR}
	        - tau=${INPUT_TAU}, core_budget_fraction=${INPUT_CBF}
	        - Per-instance timeout: ${INPUT_TIMEOUT}s

	        See \`aggregated/SWEEP_TABLE.md\` and \`cells/\` for raw per-instance JSONL checkpoints.
	        EOF
	        git add -A
	        # Distinguish "nothing staged" from a genuine commit failure; the
	        # latter must fail the step, not be reported as "nothing to
	        # commit".
	        if git diff --cached --quiet; then
	          echo "nothing to commit"
	          exit 0
	        fi
	        git commit -m "sweep ${SWEEP_ID} (source ${GITHUB_SHA})"
	        pushed=false
	        for attempt in 1 2 3; do
	          echo "Push attempt ${attempt}/3..."
	          # Best effort: the pull fails harmlessly while the branch does
	          # not exist on the remote yet.
	          git pull --rebase origin "${BR}" 2>/dev/null || true
	          if git push -u origin "${BR}"; then
	            pushed=true
	            break
	          fi
	          sleep 10
	        done
	        # Results that never reached the branch are a failure, not a
	        # success.
	        if [ "${pushed}" != true ]; then
	          echo "::error::Could not push ${SWEEP_ID} to ${BR} after 3 attempts." >&2
	          exit 1
	        fi

	# ============================================================================
	# 5. Deregister runners and destroy Hetzner server — full mode only
	# ============================================================================
	# Tears down what `provision` created. Runs in full mode after
	# `sweep-full`, whatever its result. Each step carries its own
	# `if: always()` so a failure while deregistering runners can never skip
	# the server deletion and leave the server running.
	cleanup:
	  needs: [sweep-full]
	  if: always() && inputs.mode == 'full'
	  runs-on: ubuntu-latest
	  permissions:
	    actions: write
	  steps:
	    # Best effort: deletes every runner of this run attempt. Individual
	    # delete failures are tolerated.
	    - name: Deregister self-hosted runners
	      if: always()
	      env:
	        GH_TOKEN: ${{ secrets.GH_PAT }}
	        REPO: ${{ github.repository }}
	        SERVER_NAME: bench-${{ github.run_id }}-${{ github.run_attempt }}
	      run: |
	        set -euo pipefail
	        # The trailing "-" in the prefix keeps attempt 1 ("bench-<run>-1-")
	        # from also matching the runners of attempt 10 and above.
	        RUNNER_IDS=$(curl -sf \
	          -H "Authorization: Bearer ${GH_TOKEN}" \
	          "https://api.github.com/repos/${REPO}/actions/runners?per_page=100" \
	          | jq --arg pfx "${SERVER_NAME}-" \
	            '[.runners[] | select(.name | startswith($pfx)) | .id]')
	        echo "Runner IDs to remove: ${RUNNER_IDS}"
	        echo "${RUNNER_IDS}" | jq -r '.[]' | while read -r rid; do
	          echo " Deleting runner ${rid}..."
	          curl -sf -X DELETE \
	            -H "Authorization: Bearer ${GH_TOKEN}" \
	            "https://api.github.com/repos/${REPO}/actions/runners/${rid}" || true
	        done

	    # The artifact is absent when provisioning failed before a server
	    # existed; that must not fail the job.
	    - name: Download server_id artifact
	      if: always()
	      continue-on-error: true
	      uses: actions/download-artifact@v8
	      with:
	        name: hetzner-server-id
	        path: /tmp/hetzner

	    # Deletes the server, retrying up to three times. HTTP 404 counts as
	    # success because the server is already gone; any other persistent
	    # failure fails the job so a still-running server gets noticed.
	    - name: Delete Hetzner server
	      if: always()
	      env:
	        HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
	      run: |
	        set -euo pipefail
	        if [ ! -f /tmp/hetzner/server_id.txt ]; then
	          echo "No server_id artifact — nothing to delete."
	          exit 0
	        fi
	        SERVER_ID=$(cat /tmp/hetzner/server_id.txt)
	        if [ -z "${SERVER_ID}" ] || [ "${SERVER_ID}" = "null" ]; then
	          echo "No server ID found — nothing to delete."
	          exit 0
	        fi
	        for attempt in 1 2 3; do
	          echo "Deleting Hetzner server ${SERVER_ID} (attempt ${attempt}/3)..."
	          code=$(curl -s -o /dev/null -w '%{http_code}' \
	            -X DELETE "https://api.hetzner.cloud/v1/servers/${SERVER_ID}" \
	            -H "Authorization: Bearer ${HCLOUD_TOKEN}") || code=000
	          case "${code}" in
	            2??)
	              echo "Server ${SERVER_ID} deletion requested."
	              exit 0
	              ;;
	            404)
	              echo "Server ${SERVER_ID} no longer exists."
	              exit 0
	              ;;
	          esac
	          echo " Delete request failed (HTTP ${code})." >&2
	          sleep 10
	        done
	        echo "::error::Could not delete Hetzner server ${SERVER_ID}; delete it manually." >&2
	        exit 1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

bench-sweep #55

Workflow file

bench-sweep #55

Uh oh!

Workflow file for this run