Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .beads/beads.db-shm
Binary file not shown.
Binary file modified .beads/beads.db-wal
Binary file not shown.
2 changes: 1 addition & 1 deletion .beads/issues.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
{"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"}
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"PR #35 merged (v0.4.0): full pipeline implemented — record-waa (interactive WAA API recording via VNC), annotate (VLM annotation of screenshots), eval (delegates to eval-suite). 12 harder tasks defined (0/12 zero-shot). CI workflow added. PR #36 merged (v0.4.1): fixed PyPI README images. Next: spin up Azure VM, record demos for 12 harder tasks, annotate, run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T02:00:07.491221-05:00"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"PR #35 merged (v0.4.0): full pipeline implemented — record-waa (interactive WAA API recording via VNC), annotate (VLM annotation of screenshots), eval (delegates to eval-suite). 12 harder tasks defined (0/12 zero-shot). CI workflow added. PR #36 merged (v0.4.1): fixed PyPI README images. Next: spin up Azure VM, record demos for 12 harder tasks, annotate, run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T10:42:20.670713-05:00"}
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
9 changes: 5 additions & 4 deletions openadapt_evals/benchmarks/vm_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,12 +339,13 @@ def cmd_create(args):
return 1
log("CREATE", "SSH ready")

- # Install Docker with /mnt storage
- log("CREATE", "Installing Docker with /mnt storage...")
+ # Install Docker with persistent storage
+ log("CREATE", "Installing Docker with persistent storage...")
docker_setup = """
set -e
- sudo apt-get update -qq
- sudo apt-get install -y -qq docker.io
+ export DEBIAN_FRONTEND=noninteractive
+ sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq
+ sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq docker.io
sudo systemctl start docker
sudo systemctl enable docker
sudo usermod -aG docker $USER
Expand Down
2 changes: 2 additions & 0 deletions openadapt_evals/infrastructure/azure_vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,8 @@ def _cli_create_vm(
"--generate-ssh-keys",
"--public-ip-sku",
"Standard",
+ "--os-disk-size-gb",
+ "128",
]
)

Expand Down
12 changes: 7 additions & 5 deletions openadapt_evals/infrastructure/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class PoolRunResult:
# Docker setup script for WAA workers
DOCKER_SETUP_SCRIPT = """
set -e
+ export DEBIAN_FRONTEND=noninteractive

# Wait for apt lock (unattended upgrades on fresh VMs)
echo "Waiting for apt lock..."
Expand All @@ -71,8 +72,8 @@ class PoolRunResult:
done
echo "Apt lock released"

- sudo apt-get update -qq
- sudo apt-get install -y -qq docker.io
+ sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq
+ sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq docker.io
sudo systemctl start docker
sudo systemctl enable docker
sudo usermod -aG docker $USER
Expand All @@ -97,6 +98,7 @@ class PoolRunResult:
# Docker setup script that pulls pre-built image from ACR instead of building
DOCKER_SETUP_SCRIPT_WITH_ACR = """
set -e
+ export DEBIAN_FRONTEND=noninteractive

# Wait for apt lock (unattended upgrades on fresh VMs)
echo "Waiting for apt lock..."
Expand All @@ -105,8 +107,8 @@ class PoolRunResult:
done
echo "Apt lock released"

- sudo apt-get update -qq
- sudo apt-get install -y -qq docker.io
+ sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq
+ sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq docker.io
sudo systemctl start docker
sudo systemctl enable docker
sudo usermod -aG docker $USER
Expand Down Expand Up @@ -160,7 +162,7 @@ class PoolRunResult:

# Set up socat proxy for evaluate server (Docker port forwarding doesn't work
# due to QEMU's custom bridge networking with --cap-add NET_ADMIN)
- which socat >/dev/null 2>&1 || sudo apt-get install -y -qq socat
+ which socat >/dev/null 2>&1 || sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat
killall socat 2>/dev/null || true
sleep 2
nohup socat TCP-LISTEN:5051,fork,reuseaddr EXEC:"docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050" > /dev/null 2>&1 &
Expand Down