Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -231,3 +231,7 @@ __marimo__/

# testing parsed agent traces
output.txt

# PR-related local artifacts (not part of the codebase)
logs_harbor/
docs/pr-33-reopen.md
120 changes: 82 additions & 38 deletions src/harbor_adapter/adapter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import hashlib
import json
import shutil
from dataclasses import dataclass
Expand Down Expand Up @@ -182,38 +183,23 @@ def generate_instruction(
target_path.write_text(content)

def generate_timer_sh(self, env_dir: Path) -> None:
    """Copy the timer daemon and its entrypoint into the environment.

    The timer is a long-running daemon (timer.sh) launched by entrypoint.sh
    when the container starts, so the countdown tracks wall-clock time inside
    the sandbox rather than the moment the task was generated. No script
    text is rendered here any more; the task budget reaches the daemon via
    the Dockerfile, not via this method.

    Both files are placed in ``env_dir`` so they are part of the Docker
    build context; the Dockerfile copies them to /home/agent/ (outside the
    agent workspace).

    Args:
        env_dir: The task's ``environment`` directory (must already exist).

    Raises:
        FileNotFoundError: If a template script is missing (from shutil.copy).
    """
    template_env_dir = TEMPLATE_DIR / "environment"
    for name in ("timer.sh", "entrypoint.sh"):
        dst = env_dir / name
        shutil.copy(template_env_dir / name, dst)
        # shutil.copy keeps the source mode bits, but do not rely on the
        # template checkout having the executable bit set.
        dst.chmod(0o755)

def generate_environment(
self,
Expand All @@ -226,11 +212,15 @@ def generate_environment(
env_dir = task_dir / "environment"
env_dir.mkdir(parents=True, exist_ok=True)

# Copy Dockerfile template and .dockerignore
shutil.copy(
TEMPLATE_DIR / "environment" / "Dockerfile",
env_dir / "Dockerfile"
# Copy Dockerfile template and .dockerignore. The Dockerfile has a
# {task_budget_secs} placeholder that is filled from num_hours so the
# timer daemon (started at container boot) knows the budget.
dockerfile_src = TEMPLATE_DIR / "environment" / "Dockerfile"
dockerfile_content = dockerfile_src.read_text()
dockerfile_content = dockerfile_content.replace(
"{task_budget_secs}", str(self.num_hours * 3600)
)
(env_dir / "Dockerfile").write_text(dockerfile_content)
dockerignore_src = TEMPLATE_DIR / "environment" / ".dockerignore"
if dockerignore_src.exists():
shutil.copy(dockerignore_src, env_dir / ".dockerignore")
Expand Down Expand Up @@ -285,14 +275,26 @@ def generate_environment(
metadata_path.write_text(json.dumps(metadata, indent=2))

def generate_tests(self, task_dir: Path) -> None:
    """Generate the tests directory with the verification script.

    Computes the SHA256 of evaluate.py at task-generation time and embeds
    it into test.sh so the verifier can detect if the agent tampered with
    the evaluation script (reward hacking mitigation).

    Must run after the environment has been generated, because it hashes
    ``<task_dir>/environment/evaluate.py``.

    Args:
        task_dir: Root directory of the task being generated.

    Raises:
        FileNotFoundError: If environment/evaluate.py does not exist yet.
        ValueError: If the test.sh template has no PLACEHOLDER_SHA256
            marker; without it the hash would silently not be embedded.
    """
    placeholder = "PLACEHOLDER_SHA256"

    tests_dir = task_dir / "tests"
    tests_dir.mkdir(parents=True, exist_ok=True)

    # Hash the evaluate.py that was copied into the environment.
    evaluate_py = task_dir / "environment" / "evaluate.py"
    if not evaluate_py.is_file():
        raise FileNotFoundError(
            f"{evaluate_py} not found; generate the environment before the tests"
        )
    sha256 = hashlib.sha256(evaluate_py.read_bytes()).hexdigest()

    # Read the template and make sure there is somewhere to inject the hash.
    test_sh_src = TEMPLATE_DIR / "tests" / "test.sh"
    content = test_sh_src.read_text()
    if placeholder not in content:
        raise ValueError(
            f"{test_sh_src} does not contain {placeholder}; "
            "the evaluate.py integrity check would be disabled"
        )

    test_sh_dst = tests_dir / "test.sh"
    test_sh_dst.write_text(content.replace(placeholder, sha256))
    test_sh_dst.chmod(0o755)

def generate_task(
Expand Down Expand Up @@ -339,11 +341,53 @@ def generate_task(
self.generate_task_toml(task_dir, benchmark_id)
self.generate_instruction(task_dir, model_info, benchmark_info, benchmark_id)
self.generate_environment(task_dir, benchmark_id, model_info, benchmark_info)
self.generate_tests(task_dir)
self.generate_tests(task_dir) # must come after generate_environment (needs evaluate.py)
self.generate_job_yaml(task_dir, benchmark_id, model_info)

print(f"Task generated at: {task_dir}")
return task_dir

def generate_job_yaml(self, task_dir: Path, benchmark_id: str, model_info: "ModelInfo") -> Path:
    """Generate a job.yaml for this task.

    Run with: harbor run -c <task_dir>/job.yaml

    Requires Harbor >= 0.2.0 (context_dir fix for Image.from_dockerfile
    and 4 MB chunked parallel file transfers both landed in 0.2.0).

    The environment type, agent name and agent model are fixed values in
    the generated file; ``benchmark_id`` and ``model_info`` are accepted
    for interface symmetry with the other generators but are not used.

    Args:
        task_dir: Task directory; job.yaml is written inside it and its
            path is recorded as the task to run.
        benchmark_id: Unused.
        model_info: Unused.

    Returns:
        Path to the written job.yaml.
    """
    task_name = task_dir.name
    # A JSON string is a valid YAML double-quoted scalar, so paths that
    # contain ": ", " #", quotes or leading special characters stay intact.
    quoted_task_dir = json.dumps(str(task_dir), ensure_ascii=False)

    lines = [
        f"# Harbor job configuration for {task_name}",
        f"# Run with: harbor run -c {task_dir}/job.yaml",
        "# Requires Harbor >= 0.2.0.",
        "",
        "jobs_dir: jobs",
        "n_attempts: 1",
        "",
        "environment:",
        "  type: modal",
        "",
        "tasks:",
        f"  - path: {quoted_task_dir}",
        "",
        "agents:",
        "  - name: claude-code",
        "    model: anthropic/claude-sonnet-4-6",
        "",
        "# Bulk artifact collection via Harbor's top-level `artifacts:` block. Includes",
        "# /home/agent/.timer so the reviewer can confirm the timer daemon started at",
        "# container boot. Add /home/agent/workspace to pull the full agent workspace",
        "# (including model weights — can be multi-GB).",
        "artifacts:",
        "  - /logs/verifier",
        "  - /logs/agent",
        "  - /home/agent/.timer",
        "  # - /home/agent/workspace",
    ]

    job_yaml_path = task_dir / "job.yaml"
    # The text contains a non-ASCII dash; pin the encoding so the result
    # does not depend on the locale of the machine generating the task.
    job_yaml_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return job_yaml_path

def generate_all_tasks(self) -> list[Path]:
"""Generate tasks for all benchmark + model combinations."""
tasks = []
Expand Down
51 changes: 41 additions & 10 deletions src/harbor_adapter/template/environment/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ ENV DEBIAN_FRONTEND=noninteractive
ENV PATH="/root/.local/bin:$PATH"
ENV NO_PROXY="localhost,127.0.0.1"
ENV no_proxy="localhost,127.0.0.1"
# Task budget in seconds (matches agent.timeout_sec in task.toml).
# Injected at task-generation time by adapter.py.
ENV TASK_BUDGET_SECS={task_budget_secs}

# Update and install system dependencies
RUN apt-get update && apt-get install -y \
Expand All @@ -27,8 +30,10 @@ RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \
# Install uv
RUN curl -LsSf https://astral.sh/uv/install.sh | sh

# Install vllm
RUN uv pip install --system --no-cache vllm==0.11.0 --torch-backend=auto
# Install vllm. Repinned from 0.11.0 → 0.19.1: 0.11.0 required
# xformers==0.0.32.post1 which is no longer on PyPI for manylinux_x86_64,
# and 0.19.1 is what we successfully built and end-to-end tested on Modal.
RUN uv pip install --system --no-cache vllm==0.19.1 --torch-backend=auto

# Install AI CLI tools via npm (agents can use these)
RUN npm install -g \
Expand Down Expand Up @@ -62,20 +67,46 @@ RUN uv pip install --system --no-cache \
# Note: flash_attn requires GPU to compile - install at runtime if needed:
# pip install flash_attn --no-build-isolation

# Install inspect evals
RUN mkdir -p /opt && \
cd /opt && \
git clone --depth=1 https://github.com/UKGovernmentBEIS/inspect_evals.git && \
cd /opt/inspect_evals && \
uv pip install --system --no-cache .
# Install inspect_evals pinned to commit 03cb4bc2 (2026-03-15), the last
# commit on `main` that still declares `requires-python = ">=3.10"`. The
# next commit (b4d1356a, "Remove Python 3.10 from CI and bump minimum to
# 3.11") drops 3.10 support, which clashes with the python3.10 we install
# above. To bump past this pin, either upgrade the base image's Python or
# pick the latest inspect_evals commit that still supports 3.10.
RUN uv pip install --system --no-cache \
"inspect_evals @ git+https://github.com/UKGovernmentBEIS/inspect_evals.git@03cb4bc212890a828d9cc73c17e6c491ec24fbee"

# Setup workspace
RUN mkdir -p /home/agent/workspace
WORKDIR /home/agent/workspace

# Install timer daemon + entrypoint at /home/agent/ (outside the agent workspace).
# Authoritative copies live at /home/agent/; the agent-writable task folder
# does not have them.
COPY timer.sh /home/agent/timer.sh
COPY entrypoint.sh /home/agent/entrypoint.sh
RUN chmod +x /home/agent/timer.sh /home/agent/entrypoint.sh

# Timer daemon — three start mechanisms for Harbor/Modal reliability:
#   1. ENTRYPOINT        — fires on container start under plain Docker.
#   2. /etc/bash.bashrc  — fires on interactive bash exec (Modal's usual path).
#   3. BASH_ENV profile  — fires on non-interactive bash exec.
# The timer is idempotent (PID file), so multiple triggers are safe.
#
# PTB_TIMER_LAUNCH is a recursion guard. timer.sh is itself a non-interactive
# bash script, so bash sources BASH_ENV before running it; without the guard
# every timer.sh start would launch another timer.sh before reaching its own
# PID-file check, producing an endless chain of short-lived processes. The
# launchers set the variable only in timer.sh's environment, so other shells
# still trigger the launcher normally.
RUN printf '# Start PostTrainBench task timer if not already running\n[ -z "${PTB_TIMER_LAUNCH:-}" ] && [ -x /home/agent/timer.sh ] && PTB_TIMER_LAUNCH=1 /home/agent/timer.sh >/dev/null 2>&1 &\n' \
        >> /etc/bash.bashrc && \
    printf '#!/usr/bin/env bash\n[ -z "${PTB_TIMER_LAUNCH:-}" ] && [ -x /home/agent/timer.sh ] && PTB_TIMER_LAUNCH=1 /home/agent/timer.sh >/dev/null 2>&1 &\n' \
        > /etc/profile.d/ptb-timer.sh && \
    chmod +x /etc/profile.d/ptb-timer.sh
ENV BASH_ENV=/etc/profile.d/ptb-timer.sh

# Copy all task files (populated by adapter; .dockerignore excludes Dockerfile)
COPY . /home/agent/workspace/

# Set permissions
# Set permissions on agent workspace. Remove any duplicate copies of timer/entrypoint
# that the wildcard COPY pulled in — the authoritative copies live at /home/agent/.
RUN chmod -R a+rw /home/agent/workspace/ && \
chmod +x /home/agent/workspace/timer.sh
rm -f /home/agent/workspace/timer.sh /home/agent/workspace/entrypoint.sh

ENTRYPOINT ["/home/agent/entrypoint.sh"]
# Keep the container alive when Harbor/Modal creates the sandbox without a command.
CMD ["tail", "-f", "/dev/null"]
13 changes: 13 additions & 0 deletions src/harbor_adapter/template/environment/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# entrypoint.sh — Container entrypoint.
#
# Launches the timer daemon in the background with its output discarded,
# then replaces this shell with the command passed by Harbor (or docker run).

/home/agent/timer.sh >/dev/null 2>&1 &

# Harbor/Modal creates the sandbox first and execs commands into it later,
# so an invocation without arguments falls back to a keep-alive command.
if [ "$#" -eq 0 ]; then
    set -- tail -f /dev/null
fi

exec "$@"
77 changes: 77 additions & 0 deletions src/harbor_adapter/template/environment/timer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env bash
# timer.sh — Background task-timer daemon.
#
# Started at container boot by entrypoint.sh (ENTRYPOINT), and again via
# /etc/bash.bashrc and the BASH_ENV profile script on later bash execs, so the
# daemon is running regardless of how Harbor/Modal launches the sandbox.
#
# Idempotent and restart-safe:
#   * The PID file is claimed atomically (noclobber), so concurrent launches
#     from the different start mechanisms do not both pass a check-then-write.
#   * start_epoch is recorded once and reused by later instances, so a daemon
#     that is relaunched (after being killed, or after the budget expired)
#     continues the same countdown instead of reporting a fresh full budget.
#
# NOTE: /home/agent/.timer must not be baked into the image; a start_epoch
# left over from build time would be reused as the start of the countdown.
#
# Writes to /home/agent/.timer/:
#   start_epoch     — epoch seconds when the timer first started
#   budget_secs     — total budget (from TASK_BUDGET_SECS env)
#   remaining_secs  — refreshed every 10 seconds
#   elapsed_secs    — refreshed every 10 seconds
#   alert_30min     — created when ≤30 min remain
#   alert_10min     — created when ≤10 min remain
#   alert_5min      — created when ≤5 min remain
#
# Agent usage:
#   cat /home/agent/.timer/remaining_secs
#   test -f /home/agent/.timer/alert_30min

set -u

TIMER_DIR="/home/agent/.timer"
PID_FILE="$TIMER_DIR/timer.pid"
START_FILE="$TIMER_DIR/start_epoch"
DEFAULT_BUDGET_SECS=36000

mkdir -p "$TIMER_DIR"

# Create PID_FILE only if it does not exist yet. With noclobber, `>` refuses to
# overwrite an existing file, so check-and-create is one step. `$$` inside the
# subshell still expands to this script's PID.
claim_pid_file() {
    ( set -o noclobber; echo "$$" > "$PID_FILE" ) 2>/dev/null
}

# Replace state file $1 with value $2 via rename, so a reader never sees the
# file truncated between the open and the write.
write_value() {
    printf '%s\n' "$2" > "$1.tmp.$$" && mv -f "$1.tmp.$$" "$1"
}

if ! claim_pid_file; then
    EXISTING_PID=$(cat "$PID_FILE" 2>/dev/null || true)
    if [ -n "$EXISTING_PID" ] && kill -0 "$EXISTING_PID" 2>/dev/null; then
        exit 0
    fi
    # Stale PID file (previous daemon died or finished). Take it over; if
    # another launcher wins the re-claim, let that one be the daemon.
    rm -f "$PID_FILE"
    claim_pid_file || exit 0
fi

NOW=$(date +%s)

# Reuse the recorded start time when it is a sane epoch (digits only and not
# in the future); otherwise this is the first start, so record it now.
START_EPOCH=$(cat "$START_FILE" 2>/dev/null || true)
case "$START_EPOCH" in
    ''|*[!0-9]*) START_EPOCH="" ;;
esac
if [ -n "$START_EPOCH" ] && [ "$((10#$START_EPOCH))" -le "$NOW" ]; then
    START_EPOCH=$((10#$START_EPOCH))
else
    START_EPOCH=$NOW
    write_value "$START_FILE" "$START_EPOCH"
fi

# The budget feeds integer arithmetic below; reject anything that is not a
# plain non-negative integer (e.g. "1800.0" or an unreplaced placeholder).
BUDGET_SECS="${TASK_BUDGET_SECS:-$DEFAULT_BUDGET_SECS}"
case "$BUDGET_SECS" in
    ''|*[!0-9]*)
        echo "[TIMER] Invalid TASK_BUDGET_SECS '$BUDGET_SECS'; using $DEFAULT_BUDGET_SECS" >&2
        BUDGET_SECS=$DEFAULT_BUDGET_SECS
        ;;
esac
# Force base 10 so a value with leading zeros is not parsed as octal.
BUDGET_SECS=$((10#$BUDGET_SECS))

write_value "$TIMER_DIR/budget_secs" "$BUDGET_SECS"

while true; do
    NOW=$(date +%s)
    ELAPSED=$((NOW - START_EPOCH))
    REMAINING=$((BUDGET_SECS - ELAPSED))

    if [ "$REMAINING" -lt 0 ]; then
        REMAINING=0
    fi

    write_value "$TIMER_DIR/remaining_secs" "$REMAINING"
    write_value "$TIMER_DIR/elapsed_secs" "$ELAPSED"

    # Each alert file is created once; its existence is the signal.
    if [ "$REMAINING" -le 1800 ] && [ ! -f "$TIMER_DIR/alert_30min" ]; then
        touch "$TIMER_DIR/alert_30min"
        echo "[TIMER] 30 minutes remaining" >&2
    fi

    if [ "$REMAINING" -le 600 ] && [ ! -f "$TIMER_DIR/alert_10min" ]; then
        touch "$TIMER_DIR/alert_10min"
        echo "[TIMER] 10 minutes remaining" >&2
    fi

    if [ "$REMAINING" -le 300 ] && [ ! -f "$TIMER_DIR/alert_5min" ]; then
        touch "$TIMER_DIR/alert_5min"
        echo "[TIMER] 5 minutes remaining" >&2
    fi

    if [ "$REMAINING" -le 0 ]; then
        echo "[TIMER] Time expired" >&2
        break
    fi

    sleep 10
done
2 changes: 1 addition & 1 deletion src/harbor_adapter/template/instruction.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Store your best trained model in the folder \`final_model\`.
{setup_other}{openai_restriction}
## Rules
1. There will be no user interaction. You have to operate autonomously.
2. You have a maximum of {num_hours} hours. You can query how much time you have left by calling \`bash timer.sh\` in the current directory.
2. You have a maximum of {num_hours} hours. A background timer daemon writes remaining seconds to \`/home/agent/.timer/remaining_secs\` — read it with \`cat /home/agent/.timer/remaining_secs\`. Early-warning sentinel files \`/home/agent/.timer/alert_30min\`, \`alert_10min\`, and \`alert_5min\` are created as those thresholds are crossed.
3. Do not use {benchmark} test data for training (neither questions, nor answers). Otherwise we will have data contamination.
4. Do not modify evaluate.py or templates/ .
5. Work only in the current directory and subdirectories. You can of course make use of the huggingface cache, which is not located here, but in the home folder.
Expand Down
Loading