diff --git a/.beads/beads.db-shm b/.beads/beads.db-shm index fe9ac28..3e00bde 100644 Binary files a/.beads/beads.db-shm and b/.beads/beads.db-shm differ diff --git a/.beads/beads.db-wal b/.beads/beads.db-wal index e69de29..36d3f3e 100644 Binary files a/.beads/beads.db-wal and b/.beads/beads.db-wal differ diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 1cd2c45..f07a455 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -13,5 +13,5 @@ {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"} {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"} {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"} -{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"PR #35 merged (v0.4.0): full pipeline implemented — record-waa (interactive WAA API recording via VNC), annotate (VLM annotation of screenshots), eval (delegates to eval-suite). 12 harder tasks defined (0/12 zero-shot). CI workflow added. PR #36 merged (v0.4.1): fixed PyPI README images. Next: spin up Azure VM, record demos for 12 harder tasks, annotate, run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T02:00:07.491221-05:00"} +{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"PR #35 merged (v0.4.0): full pipeline implemented — record-waa (interactive WAA API recording via VNC), annotate (VLM annotation of screenshots), eval (delegates to eval-suite). 12 harder tasks defined (0/12 zero-shot). CI workflow added. PR #36 merged (v0.4.1): fixed PyPI README images. Next: spin up Azure VM, record demos for 12 harder tasks, annotate, run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T10:42:20.670713-05:00"} {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"} diff --git a/openadapt_evals/benchmarks/vm_cli.py b/openadapt_evals/benchmarks/vm_cli.py index 122dc31..79b50ed 100644 --- a/openadapt_evals/benchmarks/vm_cli.py +++ b/openadapt_evals/benchmarks/vm_cli.py @@ -339,12 +339,13 @@ def cmd_create(args): return 1 log("CREATE", "SSH ready") - # Install Docker with /mnt storage - log("CREATE", "Installing Docker with /mnt storage...") + # Install Docker with persistent storage + log("CREATE", "Installing Docker with persistent storage...") docker_setup = """ set -e -sudo apt-get update -qq -sudo apt-get install -y -qq docker.io +export DEBIAN_FRONTEND=noninteractive +sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq docker.io sudo systemctl start docker sudo systemctl enable docker sudo usermod -aG docker $USER diff --git a/openadapt_evals/infrastructure/azure_vm.py b/openadapt_evals/infrastructure/azure_vm.py index 3608b3a..73ef5b9 100644 --- a/openadapt_evals/infrastructure/azure_vm.py +++ b/openadapt_evals/infrastructure/azure_vm.py @@ -836,6 +836,8 @@ def _cli_create_vm( "--generate-ssh-keys", "--public-ip-sku", "Standard", + "--os-disk-size-gb", + "128", ] ) diff --git a/openadapt_evals/infrastructure/pool.py b/openadapt_evals/infrastructure/pool.py index 6dcd8bb..d64f055 100644 --- a/openadapt_evals/infrastructure/pool.py +++ b/openadapt_evals/infrastructure/pool.py @@ -63,6 +63,7 @@ class PoolRunResult: # Docker setup script for WAA workers DOCKER_SETUP_SCRIPT = """ set -e +export DEBIAN_FRONTEND=noninteractive # Wait for apt lock (unattended upgrades on fresh VMs) echo "Waiting for apt lock..." @@ -71,8 +72,8 @@ class PoolRunResult: done echo "Apt lock released" -sudo apt-get update -qq -sudo apt-get install -y -qq docker.io +sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq docker.io sudo systemctl start docker sudo systemctl enable docker sudo usermod -aG docker $USER @@ -97,6 +98,7 @@ class PoolRunResult: # Docker setup script that pulls pre-built image from ACR instead of building DOCKER_SETUP_SCRIPT_WITH_ACR = """ set -e +export DEBIAN_FRONTEND=noninteractive # Wait for apt lock (unattended upgrades on fresh VMs) echo "Waiting for apt lock..." @@ -105,8 +107,8 @@ class PoolRunResult: done echo "Apt lock released" -sudo apt-get update -qq -sudo apt-get install -y -qq docker.io +sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq docker.io sudo systemctl start docker sudo systemctl enable docker sudo usermod -aG docker $USER @@ -160,7 +162,7 @@ class PoolRunResult: # Set up socat proxy for evaluate server (Docker port forwarding doesn't work # due to QEMU's custom bridge networking with --cap-add NET_ADMIN) -which socat >/dev/null 2>&1 || sudo apt-get install -y -qq socat +which socat >/dev/null 2>&1 || sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat killall socat 2>/dev/null || true sleep 2 nohup socat TCP-LISTEN:5051,fork,reuseaddr EXEC:"docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050" > /dev/null 2>&1 &