Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
478 changes: 478 additions & 0 deletions docs/design/custom_task_evaluation.md

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions example_tasks/calc-formula.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Medium task: enter values and a SUM formula in LibreOffice Calc
name: "Enter values in A1:A3 and a SUM formula in A4 in LibreOffice Calc"
id: custom-calc-formula

setup:
  # Stop any running LibreOffice instance, then start a fresh Calc window.
  # The -c argument is double-quoted: cmd.exe does not treat single quotes as
  # quoting, and PowerShell would then see the command as a string literal.
  - execute: 'powershell -c "Stop-Process -Name soffice* -Force -ErrorAction SilentlyContinue"'
  - sleep: 2
  - launch: "soffice --calc"
  - sleep: 5

evaluate:
  # Check for presence instead of an exact process count: on Windows,
  # LibreOffice typically runs as soffice.exe plus soffice.bin, and both
  # match the soffice* pattern, so the count is not reliably 1.
  - check: command
    run: |-
      powershell -c "if (Get-Process soffice* -ErrorAction SilentlyContinue) { 'running' } else { 'stopped' }"
    expect: "running"
    match: exact

  - check: screenshot
    description: "LibreOffice Calc is open with values in cells A1, A2, A3 and a SUM formula result in A4"

# Both checks above must pass.
combine: and
max_steps: 20

milestones:
  - name: "Calc is open"
    check: command
    # Same presence check as in `evaluate` (see the note there on process count).
    run: |-
      powershell -c "if (Get-Process soffice* -ErrorAction SilentlyContinue) { 'running' } else { 'stopped' }"
    expect: "running"
    match: exact

  - name: "Values entered"
    check: screenshot
    description: "Cells A1, A2, and A3 contain numeric values"

  - name: "Formula entered"
    check: screenshot
    description: "Cell A4 contains a formula result (a number)"
63 changes: 63 additions & 0 deletions example_tasks/clear-browsing-data-chrome.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Clear browsing data in Google Chrome
# This is a WAA benchmark staple — tests navigation through Settings UI
name: "Clear browsing data in Google Chrome"
id: custom-clear-chrome-data

setup:
  # Populate Chrome with browsing history so there's data to clear
  - execute: "powershell -c \"Start-Process chrome 'https://example.com' -WindowStyle Normal\""
  - sleep: 3
  - execute: "powershell -c \"Start-Process chrome 'https://wikipedia.org'\""
  - sleep: 2
  # Close Chrome so the agent starts fresh.
  # The -c argument is double-quoted like the commands above: cmd.exe does not
  # treat single quotes as quoting, and PowerShell would then see a string literal.
  - execute: 'powershell -c "Stop-Process -Name chrome -Force -ErrorAction SilentlyContinue"'
  - sleep: 2

evaluate:
  # Check 1: Chrome history is empty after clearing.
  # Heuristic: a History file smaller than 50000 bytes is treated as cleared.
  # The failure token must NOT contain the substring 'cleared', because the
  # result is compared with `match: contains`.
  - check: command
    run: |
      powershell -c "
      $histPath = \"$env:LOCALAPPDATA\\Google\\Chrome\\User Data\\Default\\History\"
      if (Test-Path $histPath) {
        $size = (Get-Item $histPath).Length
        if ($size -lt 50000) { Write-Output 'cleared' } else { Write-Output 'history_present' }
      } else { Write-Output 'no_history_file' }
      "
    expect: "cleared"
    match: contains

  # Check 2: VLM confirms the "Clear browsing data" dialog was used
  - check: screenshot
    description: "Chrome shows a confirmation that browsing data has been cleared, or the Settings page for clearing data is visible with completed state"

# Either check passing is sufficient.
combine: or
max_steps: 20

milestones:
  - name: "Chrome is open"
    check: command
    # Chrome is multi-process, so test for presence rather than looking for
    # the digit "1" somewhere in the process count.
    run: |-
      powershell -c "if (Get-Process chrome -ErrorAction SilentlyContinue) { 'running' } else { 'stopped' }"
    expect: "running"
    match: contains

  - name: "Settings page is open"
    check: screenshot
    description: "Chrome Settings page is visible, or chrome://settings is in the address bar"

  - name: "Clear browsing data dialog is open"
    check: screenshot
    description: "The 'Clear browsing data' dialog or panel is visible in Chrome"

  - name: "Data is cleared"
    check: command
    # Same size heuristic as evaluate check 1; the failure token avoids the
    # substring 'cleared' so `match: contains` cannot pass by accident.
    run: |
      powershell -c "
      $histPath = \"$env:LOCALAPPDATA\\Google\\Chrome\\User Data\\Default\\History\"
      if (Test-Path $histPath) {
        $size = (Get-Item $histPath).Length
        if ($size -lt 50000) { Write-Output 'cleared' } else { Write-Output 'history_present' }
      } else { Write-Output 'no_history_file' }
      "
    expect: "cleared"
    match: contains
63 changes: 63 additions & 0 deletions example_tasks/clear-browsing-data-edge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Clear browsing data in Microsoft Edge
# Edge is the default browser on Windows 11 WAA VMs
name: "Clear browsing data in Microsoft Edge"
id: custom-clear-edge-data

setup:
  # Populate Edge with browsing history
  - execute: "powershell -c \"Start-Process msedge 'https://example.com' -WindowStyle Normal\""
  - sleep: 3
  - execute: "powershell -c \"Start-Process msedge 'https://wikipedia.org'\""
  - sleep: 2
  # Close Edge so the agent starts fresh.
  # The -c argument is double-quoted like the commands above: cmd.exe does not
  # treat single quotes as quoting, and PowerShell would then see a string literal.
  - execute: 'powershell -c "Stop-Process -Name msedge -Force -ErrorAction SilentlyContinue"'
  - sleep: 2

evaluate:
  # Check 1: Edge history is empty after clearing.
  # Heuristic: a History file smaller than 50000 bytes is treated as cleared.
  # The failure token must NOT contain the substring 'cleared', because the
  # result is compared with `match: contains`.
  - check: command
    run: |
      powershell -c "
      $histPath = \"$env:LOCALAPPDATA\\Microsoft\\Edge\\User Data\\Default\\History\"
      if (Test-Path $histPath) {
        $size = (Get-Item $histPath).Length
        if ($size -lt 50000) { Write-Output 'cleared' } else { Write-Output 'history_present' }
      } else { Write-Output 'no_history_file' }
      "
    expect: "cleared"
    match: contains

  # Check 2: VLM confirms clearing
  - check: screenshot
    description: "Microsoft Edge shows a confirmation that browsing data has been cleared, or the Settings page shows completed clearing state"

# Either check passing is sufficient.
combine: or
max_steps: 20

milestones:
  - name: "Edge is open"
    check: command
    # Edge is multi-process, so test for presence rather than looking for
    # the digit "1" somewhere in the process count.
    run: |-
      powershell -c "if (Get-Process msedge -ErrorAction SilentlyContinue) { 'running' } else { 'stopped' }"
    expect: "running"
    match: contains

  - name: "Settings page is open"
    check: screenshot
    description: "Edge Settings page is visible, or edge://settings is in the address bar"

  - name: "Clear browsing data dialog is open"
    check: screenshot
    description: "The 'Clear browsing data' dialog or panel is visible in Edge"

  - name: "Data is cleared"
    check: command
    # Same size heuristic as evaluate check 1; the failure token avoids the
    # substring 'cleared' so `match: contains` cannot pass by accident.
    run: |
      powershell -c "
      $histPath = \"$env:LOCALAPPDATA\\Microsoft\\Edge\\User Data\\Default\\History\"
      if (Test-Path $histPath) {
        $size = (Get-Item $histPath).Length
        if ($size -lt 50000) { Write-Output 'cleared' } else { Write-Output 'history_present' }
      } else { Write-Output 'no_history_file' }
      "
    expect: "cleared"
    match: contains
26 changes: 26 additions & 0 deletions example_tasks/create-desktop-folder.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Simple 2-step task: create a folder named "TestFolder" on the Desktop
name: 'Create a folder named TestFolder on the Desktop'
id: custom-desktop-folder

setup:
  # Remove any leftover TestFolder so the task starts without it.
  - execute: 'powershell -c "Remove-Item -Path $env:USERPROFILE\Desktop\TestFolder -Recurse -Force -ErrorAction SilentlyContinue"'
  - sleep: 1

evaluate:
  # Passes when Test-Path reports that the folder exists.
  - check: command
    run: 'powershell -c "Test-Path $env:USERPROFILE\Desktop\TestFolder"'
    expect: 'True'
    match: exact

max_steps: 10

milestones:
  - name: 'Desktop is visible'
    check: screenshot
    description: 'The Windows desktop is visible with icons'

  - name: 'Folder exists'
    check: command
    run: 'powershell -c "Test-Path $env:USERPROFILE\Desktop\TestFolder"'
    expect: 'True'
    match: exact
30 changes: 30 additions & 0 deletions example_tasks/notepad-hello.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Simple 2-step task: open Notepad and type "Hello World"
name: "Open Notepad and type Hello World"
id: custom-notepad-hello

setup:
  # Close any running Notepad so the agent starts from a clean state.
  # The -c argument is double-quoted, matching the other commands in this
  # file: cmd.exe does not treat single quotes as quoting, and PowerShell
  # would then see the command as a string literal instead of running it.
  - execute: 'powershell -c "Stop-Process -Name notepad -Force -ErrorAction SilentlyContinue"'
  - sleep: 1

evaluate:
  # Exactly one notepad process must be running.
  - check: command
    run: "powershell -c \"Get-Process notepad -ErrorAction SilentlyContinue | Measure | Select -ExpandProperty Count\""
    expect: "1"
    match: exact

  - check: screenshot
    description: "Notepad window is open with 'Hello World' typed in the text area"

# Both checks above must pass.
combine: and
max_steps: 10

milestones:
  - name: "Notepad is open"
    check: command
    run: "powershell -c \"Get-Process notepad -ErrorAction SilentlyContinue | Measure | Select -ExpandProperty Count\""
    expect: "1"
    match: exact

  - name: "Text is typed"
    check: screenshot
    description: "Notepad shows 'Hello World' in the text area"
102 changes: 98 additions & 4 deletions openadapt_evals/adapters/rl_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@
BenchmarkTask,
)

# Avoid circular import — TaskConfig is imported for type checking only
TYPE_CHECKING = False
if TYPE_CHECKING:
from openadapt_evals.task_config import TaskConfig

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -112,10 +117,12 @@ def __init__(
adapter: BenchmarkAdapter,
default_task_id: str | None = None,
evaluate_every_step: bool = False,
task_config: TaskConfig | None = None,
):
self._adapter = adapter
self._default_task_id = default_task_id
self._evaluate_every_step = evaluate_every_step
self._task_config = task_config
self._current_task: BenchmarkTask | None = None
self._step_count = 0
self._done = False
Expand Down Expand Up @@ -160,6 +167,23 @@ def screen_size(self) -> tuple[int, int]:
return (width, height)
return (1920, 1200)

def load_task_config(self, task_config: TaskConfig) -> None:
    """Attach a TaskConfig so evaluation can use milestone-based rewards.

    Once attached, collect_rollout() and evaluate_dense() score the episode
    with milestone partial credit rather than the binary evaluator alone.
    The config's id also becomes the default task id.

    Args:
        task_config: A TaskConfig loaded from YAML.
    """
    self._task_config = task_config
    self._default_task_id = task_config.id

    # Gather the values for the log line up front.
    config_name = task_config.name
    milestone_count = len(task_config.milestones)
    logger.info(
        "Loaded TaskConfig: %s (%d milestones)", config_name, milestone_count
    )

def reset(self, config: ResetConfig | None = None) -> BenchmarkObservation:
"""Reset environment to a task's initial state.

Expand All @@ -186,8 +210,15 @@ def reset(self, config: ResetConfig | None = None) -> BenchmarkObservation:
"Pass task_id in ResetConfig or set default_task_id in constructor."
)

# Load and reset the task
self._current_task = self._adapter.load_task(task_id)
# Load the task — prefer TaskConfig if available (avoids server lookup)
if self._task_config and self._task_config.id == task_id:
self._current_task = self._task_config.to_benchmark_task()
elif hasattr(self._adapter, "load_task_from_json") and self._task_config:
self._current_task = self._adapter.load_task_from_json(
task_id, self._task_config.to_waa_config()
)
else:
self._current_task = self._adapter.load_task(task_id)
obs = self._adapter.reset(self._current_task)

# Reset episode state
Expand Down Expand Up @@ -429,6 +460,66 @@ def evaluate(self) -> float:
)
return result.score

def evaluate_dense(self) -> float:
    """Evaluate using dense partial rewards via milestones.

    If a TaskConfig with milestones is set, the milestone score is the
    fraction of milestones passed (0.0 to 1.0). The binary evaluate() is
    also attempted, and the higher of the two scores is returned. Falls
    back to binary evaluate() alone if no TaskConfig is set, no milestones
    are defined, or milestone evaluation reports a total of zero.

    This gives GRPO gradient signal even when no task fully completes:
    an agent that passes 3/5 milestones gets reward 0.6 vs 0.0 for
    one that passes 0/5.

    Returns:
        Dense reward score between 0.0 and 1.0.

    Raises:
        RuntimeError: If called before reset().
    """
    if self._current_task is None:
        raise RuntimeError("Call reset() before evaluate_dense().")

    # Try milestone evaluation first
    if self._task_config and self._task_config.milestones:
        # Use the most recent screenshot if one exists, else empty bytes.
        screenshot = b""
        if self._last_obs and self._last_obs.screenshot:
            screenshot = self._last_obs.screenshot

        # Adapters without a config (or without server_url) yield "".
        server_url = getattr(
            getattr(self._adapter, "config", None), "server_url", ""
        ) or ""

        passed, total = self._task_config.evaluate_milestones(
            screenshot, server_url
        )
        if total > 0:
            milestone_score = passed / total

            # Also try binary evaluation if available. This stays
            # best-effort, but the failure is logged instead of being
            # silently swallowed so evaluator problems remain visible.
            try:
                binary_score = self.evaluate()
            except Exception:
                logger.warning(
                    "Binary evaluation failed during dense evaluation; "
                    "treating binary score as 0.0",
                    exc_info=True,
                )
                binary_score = 0.0

            # Use the higher of milestone score and binary score
            # This way, full task completion (1.0) always beats partial (0.6)
            score = max(milestone_score, binary_score)

            # Backfill reward on last trajectory step
            if self._trajectory:
                last_step = self._trajectory[-1]
                last_step.reward = score
                last_step.info["milestone_score"] = milestone_score
                last_step.info["binary_score"] = binary_score
                last_step.info["milestones_passed"] = passed
                last_step.info["milestones_total"] = total

            logger.info(
                "Dense evaluation: milestones=%d/%d (%.2f), binary=%.2f, final=%.2f",
                passed, total, milestone_score, binary_score, score,
            )
            return score

    # Fallback to binary evaluation
    return self.evaluate()

def collect_rollout(
self,
agent_fn: Callable[[BenchmarkObservation], BenchmarkAction],
Expand Down Expand Up @@ -493,8 +584,11 @@ def collect_rollout(
if rollout_step.done:
break

# Evaluate and backfill reward
score = self.evaluate()
# Evaluate and backfill reward — use dense rewards if milestones exist
if self._task_config and self._task_config.milestones:
score = self.evaluate_dense()
else:
score = self.evaluate()

logger.info(
"Rollout complete: %d steps, score=%.2f",
Expand Down
Loading
Loading