From de9b4002f428c206c22191dfe949e207a4db7c7d Mon Sep 17 00:00:00 2001 From: Mersad Abbasi Date: Fri, 24 Apr 2026 17:13:04 -0700 Subject: [PATCH 1/2] Repin vllm and inspect_evals so the Dockerfile builds Two upstream-shifted dependencies were preventing the harbor_adapter Dockerfile from building on Modal: - vllm==0.11.0 requires xformers==0.0.32.post1, which is no longer on PyPI for manylinux_x86_64. Repinned to 0.19.1, which builds and runs end-to-end on Modal. - inspect_evals was cloned --depth=1 from main, but main HEAD now requires Python>=3.11 while the image installs python3.10. Switched to `uv pip install "inspect_evals @ git+...@<commit>"` pinned to commit 03cb4bc2 (2026-03-15), the last commit on main still declaring requires-python = ">=3.10". Also removes the manual git clone step. Also adds local-only artifacts to .gitignore so they don't sneak in. --- .gitignore | 4 ++++ .../template/environment/Dockerfile | 17 +++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 6962ef6..745ddf7 100644 --- a/.gitignore +++ b/.gitignore @@ -231,3 +231,7 @@ __marimo__/ # testing parsed agent traces output.txt + +# PR-related local artifacts (not part of the codebase) +logs_harbor/ +docs/pr-33-reopen.md diff --git a/src/harbor_adapter/template/environment/Dockerfile b/src/harbor_adapter/template/environment/Dockerfile index 216aebd..8864ffc 100644 --- a/src/harbor_adapter/template/environment/Dockerfile +++ b/src/harbor_adapter/template/environment/Dockerfile @@ -27,8 +27,9 @@ RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \ # Install uv RUN curl -LsSf https://astral.sh/uv/install.sh | sh -# Install vllm -RUN uv pip install --system --no-cache vllm==0.11.0 --torch-backend=auto +# Install vllm. Repinned from 0.11.0 → 0.19.1: 0.11.0 required +# xformers==0.0.32.post1 which is no longer on PyPI for manylinux_x86_64. 
+RUN uv pip install --system --no-cache vllm==0.19.1 --torch-backend=auto # Install AI CLI tools via npm (agents can use these) RUN npm install -g \ @@ -62,12 +63,12 @@ RUN uv pip install --system --no-cache \ # Note: flash_attn requires GPU to compile - install at runtime if needed: # pip install flash_attn --no-build-isolation -# Install inspect evals -RUN mkdir -p /opt && \ - cd /opt && \ - git clone --depth=1 https://github.com/UKGovernmentBEIS/inspect_evals.git && \ - cd /opt/inspect_evals && \ - uv pip install --system --no-cache . +# Install inspect_evals pinned to commit 03cb4bc2 (2026-03-15), the last +# commit on `main` that still declares `requires-python = ">=3.10"`. The +# next commit (b4d1356a) bumps to >=3.11, which clashes with the +# python3.10 we install above. +RUN uv pip install --system --no-cache \ + "inspect_evals @ git+https://github.com/UKGovernmentBEIS/inspect_evals.git@03cb4bc212890a828d9cc73c17e6c491ec24fbee" # Setup workspace RUN mkdir -p /home/agent/workspace From 45c6a188ac276c672988c777a900f94b26f121b2 Mon Sep 17 00:00:00 2001 From: Mersad Abbasi Date: Fri, 24 Apr 2026 17:22:30 -0700 Subject: [PATCH 2/2] Close Harbor integration gaps: timer daemon, verifier isolation, artifact collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #8. Replaces #33 with hrdkbhatnagar's review feedback addressed end-to-end on Modal. - Pre-agent timer is a real daemon, not a sentinel-file. Started at container boot via three redundant mechanisms (ENTRYPOINT, /etc/bash.bashrc, BASH_ENV profile script), all idempotent via a PID file. start_epoch is captured by the daemon's first iteration before any agent code runs. - tests/test.sh restructured around a single fail() helper that always writes reward.txt and metrics.json and snapshots the timer daemon's state into /logs/verifier/timer/, so reviewer-visible evidence lands on every exit path — not just the success path. 
- Artifact collection moved out of test.sh and onto Harbor's top-level artifacts: config in the generated job.yaml. Fixes a bug from #33 where the in-script find/cp loop was unreachable on early exits (e.g. missing final_model) — exactly when artifacts are most useful. - Anti-tamper shebang check: test.sh verifies timer.sh and entrypoint.sh still start with their expected shebangs. - evaluate.py SHA256 integrity check: hash injected into test.sh at task-generation time; mismatch -> reward 0. - Requires Harbor >= 0.2.0 (context_dir fix and 4 MB chunked parallel file transfers both landed in 0.2.0). --- src/harbor_adapter/adapter.py | 120 ++++++++++++------ .../template/environment/Dockerfile | 40 +++++- .../template/environment/entrypoint.sh | 13 ++ .../template/environment/timer.sh | 77 +++++++++++ src/harbor_adapter/template/instruction.md | 2 +- src/harbor_adapter/template/tests/test.sh | 78 ++++++++++-- 6 files changed, 277 insertions(+), 53 deletions(-) create mode 100755 src/harbor_adapter/template/environment/entrypoint.sh create mode 100755 src/harbor_adapter/template/environment/timer.sh diff --git a/src/harbor_adapter/adapter.py b/src/harbor_adapter/adapter.py index 44e3ec2..6e9e18e 100644 --- a/src/harbor_adapter/adapter.py +++ b/src/harbor_adapter/adapter.py @@ -1,3 +1,4 @@ +import hashlib import json import shutil from dataclasses import dataclass @@ -182,38 +183,23 @@ def generate_instruction( target_path.write_text(content) def generate_timer_sh(self, env_dir: Path) -> None: - """Generate timer.sh script that tracks remaining time. + """Copy the timer daemon and its entrypoint into the environment. - Uses a sentinel file to record the actual start time on first - invocation, so the timer is accurate even if the task is generated - long before the agent runs. + The timer is a long-running daemon started by entrypoint.sh at container + boot (and by bashrc/profile.d as a Modal-exec fallback). 
START_EPOCH is + captured when the daemon starts, so the clock tracks actual wall-clock + from sandbox creation — not task-generation time. + + Both files are placed in env_dir so they are part of the Docker build + context; the Dockerfile copies them to /home/agent/ (outside the agent + workspace) so the agent-writable task folder does not contain the + authoritative copies. """ - timer_script = f"""#!/bin/bash - -NUM_HOURS={self.num_hours} - -START_FILE="$(dirname "$0")/.timer_start" -if [ ! -f "$START_FILE" ]; then - date +%s > "$START_FILE" -fi -START_DATE=$(cat "$START_FILE") - -DEADLINE=$((START_DATE + NUM_HOURS * 3600)) -NOW=$(date +%s) -REMAINING=$((DEADLINE - NOW)) - -if [ $REMAINING -le 0 ]; then - echo "Timer expired!" -else - echo "Remaining time (hours:minutes)": - HOURS=$((REMAINING / 3600)) - MINUTES=$(((REMAINING % 3600) / 60)) - printf "%d:%02d\\n" $HOURS $MINUTES -fi -""" - timer_path = env_dir / "timer.sh" - timer_path.write_text(timer_script) - timer_path.chmod(0o755) + for name in ("timer.sh", "entrypoint.sh"): + src = TEMPLATE_DIR / "environment" / name + dst = env_dir / name + shutil.copy(src, dst) + dst.chmod(0o755) def generate_environment( self, @@ -226,11 +212,15 @@ def generate_environment( env_dir = task_dir / "environment" env_dir.mkdir(parents=True, exist_ok=True) - # Copy Dockerfile template and .dockerignore - shutil.copy( - TEMPLATE_DIR / "environment" / "Dockerfile", - env_dir / "Dockerfile" + # Copy Dockerfile template and .dockerignore. The Dockerfile has a + # {task_budget_secs} placeholder that is filled from num_hours so the + # timer daemon (started at container boot) knows the budget. 
+ dockerfile_src = TEMPLATE_DIR / "environment" / "Dockerfile" + dockerfile_content = dockerfile_src.read_text() + dockerfile_content = dockerfile_content.replace( + "{task_budget_secs}", str(self.num_hours * 3600) ) + (env_dir / "Dockerfile").write_text(dockerfile_content) dockerignore_src = TEMPLATE_DIR / "environment" / ".dockerignore" if dockerignore_src.exists(): shutil.copy(dockerignore_src, env_dir / ".dockerignore") @@ -285,14 +275,26 @@ def generate_environment( metadata_path.write_text(json.dumps(metadata, indent=2)) def generate_tests(self, task_dir: Path) -> None: - """Generate the tests directory with verification script.""" + """Generate the tests directory with verification script. + + Computes the SHA256 of evaluate.py at task-generation time and embeds + it into test.sh so the verifier can detect if the agent tampered with + the evaluation script (reward hacking mitigation). + """ tests_dir = task_dir / "tests" tests_dir.mkdir(parents=True, exist_ok=True) - # Copy test.sh + # Compute SHA256 of the evaluate.py that was copied into the environment + evaluate_py = task_dir / "environment" / "evaluate.py" + sha256 = hashlib.sha256(evaluate_py.read_bytes()).hexdigest() + + # Read template, inject hash, and write test_sh_src = TEMPLATE_DIR / "tests" / "test.sh" + content = test_sh_src.read_text() + content = content.replace("PLACEHOLDER_SHA256", sha256) + test_sh_dst = tests_dir / "test.sh" - shutil.copy(test_sh_src, test_sh_dst) + test_sh_dst.write_text(content) test_sh_dst.chmod(0o755) def generate_task( @@ -339,11 +341,53 @@ def generate_task( self.generate_task_toml(task_dir, benchmark_id) self.generate_instruction(task_dir, model_info, benchmark_info, benchmark_id) self.generate_environment(task_dir, benchmark_id, model_info, benchmark_info) - self.generate_tests(task_dir) + self.generate_tests(task_dir) # must come after generate_environment (needs evaluate.py) + self.generate_job_yaml(task_dir, benchmark_id, model_info) print(f"Task generated at: 
{task_dir}") return task_dir + def generate_job_yaml(self, task_dir: Path, benchmark_id: str, model_info: "ModelInfo") -> Path: + """Generate a job.yaml for this task. + + Run with: harbor run -c <task_dir>/job.yaml + + Requires Harbor >= 0.2.0 (context_dir fix for Image.from_dockerfile + and 4 MB chunked parallel file transfers both landed in 0.2.0). + """ + task_name = task_dir.name + job_yaml = f"""\ +# Harbor job configuration for {task_name} +# Run with: harbor run -c {task_dir}/job.yaml +# Requires Harbor >= 0.2.0. + +jobs_dir: jobs +n_attempts: 1 + +environment: + type: modal + +tasks: + - path: {task_dir} + +agents: + - name: claude-code + model: anthropic/claude-sonnet-4-6 + +# Bulk artifact collection via Harbor's top-level `artifacts:` block. Includes +# /home/agent/.timer so the reviewer can confirm the timer daemon started at +# container boot. Add /home/agent/workspace to pull the full agent workspace +# (including model weights — can be multi-GB). +artifacts: + - /logs/verifier + - /logs/agent + - /home/agent/.timer + # - /home/agent/workspace +""" + job_yaml_path = task_dir / "job.yaml" + job_yaml_path.write_text(job_yaml) + return job_yaml_path + def generate_all_tasks(self) -> list[Path]: """Generate tasks for all benchmark + model combinations.""" tasks = [] diff --git a/src/harbor_adapter/template/environment/Dockerfile b/src/harbor_adapter/template/environment/Dockerfile index 8864ffc..5a12af2 100644 --- a/src/harbor_adapter/template/environment/Dockerfile +++ b/src/harbor_adapter/template/environment/Dockerfile @@ -4,6 +4,9 @@ ENV DEBIAN_FRONTEND=noninteractive ENV PATH="/root/.local/bin:$PATH" ENV NO_PROXY="localhost,127.0.0.1" ENV no_proxy="localhost,127.0.0.1" +# Task budget in seconds (matches agent.timeout_sec in task.toml). +# Injected at task-generation time by adapter.py. 
+ENV TASK_BUDGET_SECS={task_budget_secs} # Update and install system dependencies RUN apt-get update && apt-get install -y \ @@ -28,7 +31,8 @@ RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \ RUN curl -LsSf https://astral.sh/uv/install.sh | sh # Install vllm. Repinned from 0.11.0 → 0.19.1: 0.11.0 required -# xformers==0.0.32.post1 which is no longer on PyPI for manylinux_x86_64. +# xformers==0.0.32.post1 which is no longer on PyPI for manylinux_x86_64, +# and 0.19.1 is what we successfully built and end-to-end tested on Modal. RUN uv pip install --system --no-cache vllm==0.19.1 --torch-backend=auto # Install AI CLI tools via npm (agents can use these) @@ -65,8 +69,10 @@ RUN uv pip install --system --no-cache \ # Install inspect_evals pinned to commit 03cb4bc2 (2026-03-15), the last # commit on `main` that still declares `requires-python = ">=3.10"`. The -# next commit (b4d1356a) bumps to >=3.11, which clashes with the -# python3.10 we install above. +# next commit (b4d1356a, "Remove Python 3.10 from CI and bump minimum to +# 3.11") drops 3.10 support, which clashes with the python3.10 we install +# above. To bump past this pin, either upgrade the base image's Python or +# pick the latest inspect_evals commit that still supports 3.10. RUN uv pip install --system --no-cache \ "inspect_evals @ git+https://github.com/UKGovernmentBEIS/inspect_evals.git@03cb4bc212890a828d9cc73c17e6c491ec24fbee" @@ -74,9 +80,33 @@ RUN uv pip install --system --no-cache \ RUN mkdir -p /home/agent/workspace WORKDIR /home/agent/workspace +# Install timer daemon + entrypoint at /home/agent/ (outside the agent workspace). +# Authoritative copies live at /home/agent/; the agent-writable task folder +# does not have them. +COPY timer.sh /home/agent/timer.sh +COPY entrypoint.sh /home/agent/entrypoint.sh +RUN chmod +x /home/agent/timer.sh /home/agent/entrypoint.sh + +# Timer daemon — three start mechanisms for Harbor/Modal reliability: +# 1. 
ENTRYPOINT — fires on container start under plain Docker. +# 2. /etc/bash.bashrc — fires on interactive bash exec (Modal's usual path). +# 3. BASH_ENV profile — fires on non-interactive bash exec. +# The timer is idempotent (PID file), so multiple triggers are safe. +RUN printf '# Start PostTrainBench task timer if not already running\n/home/agent/timer.sh >/dev/null 2>&1 &\n' \ + >> /etc/bash.bashrc && \ + printf '#!/usr/bin/env bash\n[ -x /home/agent/timer.sh ] && /home/agent/timer.sh >/dev/null 2>&1 &\n' \ + > /etc/profile.d/ptb-timer.sh && \ + chmod +x /etc/profile.d/ptb-timer.sh +ENV BASH_ENV=/etc/profile.d/ptb-timer.sh + # Copy all task files (populated by adapter; .dockerignore excludes Dockerfile) COPY . /home/agent/workspace/ -# Set permissions +# Set permissions on agent workspace. Remove any duplicate copies of timer/entrypoint +# that the wildcard COPY pulled in — the authoritative copies live at /home/agent/. RUN chmod -R a+rw /home/agent/workspace/ && \ - chmod +x /home/agent/workspace/timer.sh + rm -f /home/agent/workspace/timer.sh /home/agent/workspace/entrypoint.sh + +ENTRYPOINT ["/home/agent/entrypoint.sh"] +# Keep the container alive when Harbor/Modal creates the sandbox without a command. +CMD ["tail", "-f", "/dev/null"] diff --git a/src/harbor_adapter/template/environment/entrypoint.sh b/src/harbor_adapter/template/environment/entrypoint.sh new file mode 100755 index 0000000..5c4cbb1 --- /dev/null +++ b/src/harbor_adapter/template/environment/entrypoint.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# entrypoint.sh — Container entrypoint. Starts the timer daemon in the +# background and then execs whatever command Harbor (or docker run) passes. + +/home/agent/timer.sh >/dev/null 2>&1 & + +# Harbor/Modal creates the sandbox first and execs commands into it later. +# If no explicit command is provided, keep the sandbox alive. 
+if [ "$#" -eq 0 ]; then + exec tail -f /dev/null +fi + +exec "$@" diff --git a/src/harbor_adapter/template/environment/timer.sh b/src/harbor_adapter/template/environment/timer.sh new file mode 100755 index 0000000..89ba63a --- /dev/null +++ b/src/harbor_adapter/template/environment/timer.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# timer.sh — Background task-timer daemon. +# +# Started at container boot by entrypoint.sh (ENTRYPOINT), and again via +# /etc/bash.bashrc and /etc/profile.d on the first exec() call, so the +# daemon is running regardless of how Harbor/Modal launches the sandbox. +# Idempotent: a PID file prevents duplicate instances. +# +# Writes to /home/agent/.timer/: +# start_epoch — epoch seconds when timer started (at container boot) +# budget_secs — total budget (from TASK_BUDGET_SECS env) +# remaining_secs — refreshed every 10 seconds +# elapsed_secs — refreshed every 10 seconds +# alert_30min — created when ≤30 min remain +# alert_10min — created when ≤10 min remain +# alert_5min — created when ≤5 min remain +# +# Agent usage: +# cat /home/agent/.timer/remaining_secs +# test -f /home/agent/.timer/alert_30min + +set -u + +TIMER_DIR="/home/agent/.timer" +PID_FILE="$TIMER_DIR/timer.pid" + +mkdir -p "$TIMER_DIR" + +if [ -f "$PID_FILE" ]; then + EXISTING_PID=$(cat "$PID_FILE" 2>/dev/null) + if [ -n "$EXISTING_PID" ] && kill -0 "$EXISTING_PID" 2>/dev/null; then + exit 0 + fi +fi + +echo $$ > "$PID_FILE" + +START_EPOCH=$(date +%s) +BUDGET_SECS="${TASK_BUDGET_SECS:-36000}" + +echo "$START_EPOCH" > "$TIMER_DIR/start_epoch" +echo "$BUDGET_SECS" > "$TIMER_DIR/budget_secs" + +while true; do + NOW=$(date +%s) + ELAPSED=$((NOW - START_EPOCH)) + REMAINING=$((BUDGET_SECS - ELAPSED)) + + if [ "$REMAINING" -lt 0 ]; then + REMAINING=0 + fi + + echo "$REMAINING" > "$TIMER_DIR/remaining_secs" + echo "$ELAPSED" > "$TIMER_DIR/elapsed_secs" + + if [ "$REMAINING" -le 1800 ] && [ ! 
-f "$TIMER_DIR/alert_30min" ]; then + touch "$TIMER_DIR/alert_30min" + echo "[TIMER] 30 minutes remaining" >&2 + fi + + if [ "$REMAINING" -le 600 ] && [ ! -f "$TIMER_DIR/alert_10min" ]; then + touch "$TIMER_DIR/alert_10min" + echo "[TIMER] 10 minutes remaining" >&2 + fi + + if [ "$REMAINING" -le 300 ] && [ ! -f "$TIMER_DIR/alert_5min" ]; then + touch "$TIMER_DIR/alert_5min" + echo "[TIMER] 5 minutes remaining" >&2 + fi + + if [ "$REMAINING" -le 0 ]; then + echo "[TIMER] Time expired" >&2 + break + fi + + sleep 10 +done diff --git a/src/harbor_adapter/template/instruction.md b/src/harbor_adapter/template/instruction.md index dd7a3f1..9a53a97 100644 --- a/src/harbor_adapter/template/instruction.md +++ b/src/harbor_adapter/template/instruction.md @@ -20,7 +20,7 @@ Store your best trained model in the folder \`final_model\`. {setup_other}{openai_restriction} ## Rules 1. There will be no user interaction. You have to operate autonomously. -2. You have a maximum of {num_hours} hours. You can query how much time you have left by calling \`bash timer.sh\` in the current directory. +2. You have a maximum of {num_hours} hours. A background timer daemon writes remaining seconds to \`/home/agent/.timer/remaining_secs\` — read it with \`cat /home/agent/.timer/remaining_secs\`. Early-warning sentinel files \`/home/agent/.timer/alert_30min\`, \`alert_10min\`, and \`alert_5min\` are created as those thresholds are crossed. 3. Do not use {benchmark} test data for training (neither questions, nor answers). Otherwise we will have data contamination. 4. Do not modify evaluate.py or templates/ . 5. Work only in the current directory and subdirectories. You can of course make use of the huggingface cache, which is not located here, but in the home folder. 
diff --git a/src/harbor_adapter/template/tests/test.sh b/src/harbor_adapter/template/tests/test.sh index 76d4371..4efa78d 100644 --- a/src/harbor_adapter/template/tests/test.sh +++ b/src/harbor_adapter/template/tests/test.sh @@ -3,13 +3,77 @@ set -e # PostTrainBench verification script # Runs contamination judge (codex CLI) and 3-phase evaluation with retry logic. -# Matches the original run_task.sh evaluation pipeline. + +# SHA256 of evaluate.py at task-generation time (injected by adapter.py). +# If the agent tampered with evaluate.py this check catches it. +EVALUATE_PY_SHA256="PLACEHOLDER_SHA256" WORKSPACE="/home/agent/workspace" LOGS_DIR="/logs/verifier" +TIMER_DIR="/home/agent/.timer" mkdir -p "$LOGS_DIR" +# Centralized failure path: record a reason, write reward 0, include the +# timer daemon state snapshot so the reviewer can confirm the clock was +# running at container-boot time. Exits 0 so Harbor scores the trial. +fail() { + local reason="$1" + echo "FAIL: $reason" + python3 -c "import json, sys; json.dump({'error': sys.argv[1], 'accuracy': 0}, open(sys.argv[2], 'w'))" \ + "$reason" "$LOGS_DIR/metrics.json" + echo "0" > "$LOGS_DIR/reward.txt" + # Snapshot timer state for post-mortem. Never fails. + if [ -d "$TIMER_DIR" ]; then + mkdir -p "$LOGS_DIR/timer" + cp -a "$TIMER_DIR/." 
"$LOGS_DIR/timer/" 2>/dev/null || true + fi + exit 0 +} + +# ============================================================ +# Anti-cheat 1: evaluate.py integrity +# ============================================================ +echo "=== Verifying evaluate.py integrity ===" +ACTUAL_SHA=$(sha256sum "$WORKSPACE/evaluate.py" 2>/dev/null | awk '{print $1}') +if [ "$ACTUAL_SHA" != "$EVALUATE_PY_SHA256" ]; then + echo " expected: $EVALUATE_PY_SHA256" + echo " actual: $ACTUAL_SHA" + fail "evaluate.py tampered with by agent (sha256 mismatch)" +fi +echo "OK: evaluate.py integrity verified ($EVALUATE_PY_SHA256)" + +# ============================================================ +# Anti-cheat 2: timer/entrypoint shebang intact +# The root agent can't stop us from reading /home/agent/timer.sh, but a +# sloppy tamper (e.g. rewriting them to no-ops) shows up as a broken +# shebang. Belt-and-suspenders for the timer daemon guarantee. +# ============================================================ +for immutable in /home/agent/timer.sh /home/agent/entrypoint.sh; do + if [ -f "$immutable" ]; then + first_line="$(head -1 "$immutable" 2>/dev/null || true)" + case "$first_line" in + '#!/usr/bin/env bash'|'#!/bin/bash') + ;; + *) + fail "$(basename "$immutable") was tampered with (shebang mismatch)" + ;; + esac + fi +done +echo "OK: timer.sh and entrypoint.sh shebangs intact" + +# ============================================================ +# Snapshot timer daemon state (for PR-evidence / debug) +# ============================================================ +if [ -d "$TIMER_DIR" ]; then + mkdir -p "$LOGS_DIR/timer" + cp -a "$TIMER_DIR/." 
"$LOGS_DIR/timer/" 2>/dev/null || true + echo "OK: timer state snapshotted" +else + echo "WARN: $TIMER_DIR does not exist — timer daemon may not have run" +fi + echo "=== PostTrainBench Verifier ===" echo "Workspace: $WORKSPACE" echo "Logs dir: $LOGS_DIR" @@ -23,11 +87,8 @@ nvidia-smi 2>&1 | tee "$LOGS_DIR/gpu_check.txt" || echo "nvidia-smi failed" echo "" echo "=== Checking final_model ===" if [ ! -d "$WORKSPACE/final_model" ]; then - echo "ERROR: final_model directory not found" ls -la "$WORKSPACE" > "$LOGS_DIR/workspace_listing.txt" 2>&1 - echo '{"error": "final_model not found", "accuracy": 0}' > "$LOGS_DIR/metrics.json" - echo "0" > "$LOGS_DIR/reward.txt" - exit 0 + fail "final_model directory not found" fi # Check if final_model has required files @@ -35,10 +96,7 @@ echo "Contents of final_model:" ls -la "$WORKSPACE/final_model" | tee "$LOGS_DIR/final_model_listing.txt" if [ ! -f "$WORKSPACE/final_model/config.json" ]; then - echo "ERROR: final_model/config.json not found - not a valid model" - echo '{"error": "invalid model - no config.json", "accuracy": 0}' > "$LOGS_DIR/metrics.json" - echo "0" > "$LOGS_DIR/reward.txt" - exit 0 + fail "final_model/config.json missing — not a valid model" fi # Show model config @@ -274,3 +332,5 @@ echo "" echo "=== Verification complete ===" echo "Results in $LOGS_DIR/" ls -la "$LOGS_DIR/" +# Full-workspace artifact collection is handled by Harbor's top-level +# artifacts: config in job.yaml, not by this script.