Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .beads/issues.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
{"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"}
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Pipeline complete. 3 annotated demos produced. Need Azure VM to run eval. Anthropic credits depleted — use OpenAI.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-18T00:03:34.77925-05:00"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"PR #35 merged (v0.4.0): full pipeline implemented — record-waa (interactive WAA API recording via VNC), annotate (VLM annotation of screenshots), eval (delegates to eval-suite). 12 harder tasks defined (0/12 zero-shot). CI workflow added. PR #36 merged (v0.4.1): fixed PyPI README images. Next: spin up Azure VM, record demos for 12 harder tasks, annotate, run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T02:00:07.491221-05:00"}
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
9 changes: 5 additions & 4 deletions openadapt_evals/benchmarks/vm_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,15 +349,16 @@ def cmd_create(args):
sudo systemctl enable docker
sudo usermod -aG docker $USER

- # Configure Docker to use /mnt (larger temp disk)
+ # Configure Docker to use persistent storage (NOT /mnt which is ephemeral
+ # and gets wiped on VM deallocate, breaking pool-resume)
sudo systemctl stop docker
- sudo mkdir -p /mnt/docker
- sudo bash -c 'echo "{\\"data-root\\": \\"/mnt/docker\\"}" > /etc/docker/daemon.json'
+ sudo mkdir -p /home/azureuser/docker
+ sudo bash -c 'echo "{\\"data-root\\": \\"/home/azureuser/docker\\"}" > /etc/docker/daemon.json'
sudo systemctl start docker

# Verify
docker --version
- df -h /mnt
+ df -h /home
"""
result = ssh_run(ip, docker_setup, stream=True, step="CREATE")
if result.returncode != 0:
Expand Down
1 change: 1 addition & 0 deletions openadapt_evals/infrastructure/azure_vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,7 @@ def _sdk_create_vm(
"image_reference": {"id": image_id} if image_id else _UBUNTU_2204_IMAGE,
"os_disk": {
"create_option": "FromImage",
"disk_size_gb": 128,
"managed_disk": {"storage_account_type": "Premium_LRS"},
},
},
Expand Down
14 changes: 8 additions & 6 deletions openadapt_evals/infrastructure/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@ class PoolRunResult:
sudo systemctl enable docker
sudo usermod -aG docker $USER

- # Configure Docker to use /mnt (larger temp disk)
+ # Configure Docker to use persistent storage (NOT /mnt which is ephemeral
+ # and gets wiped on VM deallocate, breaking pool-resume)
sudo systemctl stop docker
- sudo mkdir -p /mnt/docker
- sudo bash -c 'echo "{\\"data-root\\": \\"/mnt/docker\\"}" > /etc/docker/daemon.json'
+ sudo mkdir -p /home/azureuser/docker
+ sudo bash -c 'echo "{\\"data-root\\": \\"/home/azureuser/docker\\"}" > /etc/docker/daemon.json'
sudo systemctl start docker

# Pull base images (use sudo since usermod hasn't taken effect yet)
Expand Down Expand Up @@ -110,10 +111,11 @@ class PoolRunResult:
sudo systemctl enable docker
sudo usermod -aG docker $USER

- # Configure Docker to use /mnt (larger temp disk)
+ # Configure Docker to use persistent storage (NOT /mnt which is ephemeral
+ # and gets wiped on VM deallocate, breaking pool-resume)
sudo systemctl stop docker
- sudo mkdir -p /mnt/docker
- sudo bash -c 'echo "{{\\"data-root\\": \\"/mnt/docker\\"}}" > /etc/docker/daemon.json'
+ sudo mkdir -p /home/azureuser/docker
+ sudo bash -c 'echo "{{\\"data-root\\": \\"/home/azureuser/docker\\"}}" > /etc/docker/daemon.json'
sudo systemctl start docker

# Pull pre-built image from ACR (faster than building)
Expand Down