OpenAdaptAI
diff --git a/‎docs/design/custom_task_evaluation.md‎
Lines changed: 478 additions & 0 deletions b/‎docs/design/custom_task_evaluation.md‎
Lines changed: 478 additions & 0 deletions
diff --git a/‎example_tasks/calc-formula.yaml‎
Lines changed: 37 additions & 0 deletions b/‎example_tasks/calc-formula.yaml‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎example_tasks/clear-browsing-data-chrome.yaml‎
Lines changed: 63 additions & 0 deletions b/‎example_tasks/clear-browsing-data-chrome.yaml‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎example_tasks/clear-browsing-data-edge.yaml‎
Lines changed: 63 additions & 0 deletions b/‎example_tasks/clear-browsing-data-edge.yaml‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎example_tasks/create-desktop-folder.yaml‎
Lines changed: 26 additions & 0 deletions b/‎example_tasks/create-desktop-folder.yaml‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎example_tasks/notepad-hello.yaml‎
Lines changed: 30 additions & 0 deletions b/‎example_tasks/notepad-hello.yaml‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎openadapt_evals/adapters/rl_env.py‎
Lines changed: 98 additions & 4 deletions b/‎openadapt_evals/adapters/rl_env.py‎
Lines changed: 98 additions & 4 deletions
@@ -0,0 +1,37 @@
+# Medium task: enter values and a SUM formula in LibreOffice Calc
+name: "Enter values in A1:A3 and a SUM formula in A4 in LibreOffice Calc"
+id: custom-calc-formula
+
+setup:
+  - execute: "powershell -c 'Stop-Process -Name soffice* -Force -ErrorAction SilentlyContinue'"
+  - sleep: 2
+  - launch: "soffice --calc"
+  - sleep: 5
+
+evaluate:
+  - check: command
+    # soffice* can match more than one process (soffice and soffice.bin), so test presence, not an exact count
+    run: "powershell -c \"if (Get-Process soffice* -ErrorAction SilentlyContinue) { Write-Output 'running' } else { Write-Output 'stopped' }\""
+    expect: "running"
+    match: exact
+
+  - check: screenshot
+    description: "LibreOffice Calc is open with values in cells A1, A2, A3 and a SUM formula result in A4"
+
+combine: and
+max_steps: 20
+
+milestones:
+  - name: "Calc is open"
+    check: command
+    run: "powershell -c \"if (Get-Process soffice* -ErrorAction SilentlyContinue) { Write-Output 'running' } else { Write-Output 'stopped' }\""
+    expect: "running"
+    match: exact
+
+  - name: "Values entered"
+    check: screenshot
+    description: "Cells A1, A2, and A3 contain numeric values"
+
+  - name: "Formula entered"
+    check: screenshot
+    description: "Cell A4 contains a formula result (a number)"
@@ -0,0 +1,63 @@
+# Clear browsing data in Google Chrome
+# This is a WAA benchmark staple — tests navigation through Settings UI
+name: "Clear browsing data in Google Chrome"
+id: custom-clear-chrome-data
+
+setup:
+  # Populate Chrome with browsing history so there's data to clear
+  - execute: "powershell -c \"Start-Process chrome 'https://example.com' -WindowStyle Normal\""
+  - sleep: 3
+  - execute: "powershell -c \"Start-Process chrome 'https://wikipedia.org'\""
+  - sleep: 2
+  # Close Chrome so the agent starts fresh
+  - execute: "powershell -c 'Stop-Process -Name chrome -Force -ErrorAction SilentlyContinue'"
+  - sleep: 2
+
+evaluate:
+  # Check 1: History DB under 50000 bytes counts as cleared; the failure token must not contain "cleared"
+  - check: command
+    run: |
+      powershell -c "
+        $histPath = \"$env:LOCALAPPDATA\\Google\\Chrome\\User Data\\Default\\History\"
+        if (Test-Path $histPath) {
+          $size = (Get-Item $histPath).Length
+          if ($size -lt 50000) { Write-Output 'cleared' } else { Write-Output 'history_present' }
+        } else { Write-Output 'no_history_file' }
+      "
+    expect: "cleared"
+    match: contains
+
+  # Check 2: VLM confirms the "Clear browsing data" dialog was used
+  - check: screenshot
+    description: "Chrome shows a confirmation that browsing data has been cleared, or the Settings page for clearing data is visible with completed state"
+
+combine: or
+max_steps: 20
+
+milestones:
+  - name: "Chrome is open"
+    check: command
+    run: "powershell -c \"if (Get-Process chrome -ErrorAction SilentlyContinue) { Write-Output 'running' } else { Write-Output 'stopped' }\""
+    expect: "running"
+    match: contains
+
+  - name: "Settings page is open"
+    check: screenshot
+    description: "Chrome Settings page is visible, or chrome://settings is in the address bar"
+
+  - name: "Clear browsing data dialog is open"
+    check: screenshot
+    description: "The 'Clear browsing data' dialog or panel is visible in Chrome"
+
+  - name: "Data is cleared"
+    check: command
+    run: |
+      powershell -c "
+        $histPath = \"$env:LOCALAPPDATA\\Google\\Chrome\\User Data\\Default\\History\"
+        if (Test-Path $histPath) {
+          $size = (Get-Item $histPath).Length
+          if ($size -lt 50000) { Write-Output 'cleared' } else { Write-Output 'history_present' }
+        } else { Write-Output 'no_history_file' }
+      "
+    expect: "cleared"
+    match: contains
@@ -0,0 +1,63 @@
+# Clear browsing data in Microsoft Edge
+# Edge is the default browser on Windows 11 WAA VMs
+name: "Clear browsing data in Microsoft Edge"
+id: custom-clear-edge-data
+
+setup:
+  # Populate Edge with browsing history
+  - execute: "powershell -c \"Start-Process msedge 'https://example.com' -WindowStyle Normal\""
+  - sleep: 3
+  - execute: "powershell -c \"Start-Process msedge 'https://wikipedia.org'\""
+  - sleep: 2
+  # Close Edge so the agent starts fresh
+  - execute: "powershell -c 'Stop-Process -Name msedge -Force -ErrorAction SilentlyContinue'"
+  - sleep: 2
+
+evaluate:
+  # Check 1: History DB under 50000 bytes counts as cleared; the failure token must not contain "cleared"
+  - check: command
+    run: |
+      powershell -c "
+        $histPath = \"$env:LOCALAPPDATA\\Microsoft\\Edge\\User Data\\Default\\History\"
+        if (Test-Path $histPath) {
+          $size = (Get-Item $histPath).Length
+          if ($size -lt 50000) { Write-Output 'cleared' } else { Write-Output 'history_present' }
+        } else { Write-Output 'no_history_file' }
+      "
+    expect: "cleared"
+    match: contains
+
+  # Check 2: VLM confirms clearing
+  - check: screenshot
+    description: "Microsoft Edge shows a confirmation that browsing data has been cleared, or the Settings page shows completed clearing state"
+
+combine: or
+max_steps: 20
+
+milestones:
+  - name: "Edge is open"
+    check: command
+    run: "powershell -c \"if (Get-Process msedge -ErrorAction SilentlyContinue) { Write-Output 'running' } else { Write-Output 'stopped' }\""
+    expect: "running"
+    match: contains
+
+  - name: "Settings page is open"
+    check: screenshot
+    description: "Edge Settings page is visible, or edge://settings is in the address bar"
+
+  - name: "Clear browsing data dialog is open"
+    check: screenshot
+    description: "The 'Clear browsing data' dialog or panel is visible in Edge"
+
+  - name: "Data is cleared"
+    check: command
+    run: |
+      powershell -c "
+        $histPath = \"$env:LOCALAPPDATA\\Microsoft\\Edge\\User Data\\Default\\History\"
+        if (Test-Path $histPath) {
+          $size = (Get-Item $histPath).Length
+          if ($size -lt 50000) { Write-Output 'cleared' } else { Write-Output 'history_present' }
+        } else { Write-Output 'no_history_file' }
+      "
+    expect: "cleared"
+    match: contains
@@ -0,0 +1,26 @@
+# Simple 2-step task: create a folder (not a file) named "TestFolder" on the Desktop
+name: "Create a folder named TestFolder on the Desktop"
+id: custom-desktop-folder
+
+setup:
+  - execute: "powershell -c \"Remove-Item -Path $env:USERPROFILE\\Desktop\\TestFolder -Recurse -Force -ErrorAction SilentlyContinue\""
+  - sleep: 1
+
+evaluate:
+  - check: command
+    run: "powershell -c \"Test-Path -Path $env:USERPROFILE\\Desktop\\TestFolder -PathType Container\""
+    expect: "True"
+    match: exact
+
+max_steps: 10
+
+milestones:
+  - name: "Desktop is visible"
+    check: screenshot
+    description: "The Windows desktop is visible with icons"
+
+  - name: "Folder exists"
+    check: command
+    run: "powershell -c \"Test-Path -Path $env:USERPROFILE\\Desktop\\TestFolder -PathType Container\""
+    expect: "True"
+    match: exact
@@ -0,0 +1,30 @@
+# Simple 2-step task: launch Notepad, then type "Hello World" into it
+name: 'Open Notepad and type Hello World'
+id: custom-notepad-hello
+
+setup:
+  - execute: "powershell -c 'Stop-Process -Name notepad -Force -ErrorAction SilentlyContinue'"
+  - sleep: 1
+
+evaluate:
+  - check: command
+    run: 'powershell -c "Get-Process notepad -ErrorAction SilentlyContinue | Measure | Select -ExpandProperty Count"'
+    expect: '1'
+    match: exact
+
+  - check: screenshot
+    description: "Notepad window is open with 'Hello World' typed in the text area"
+
+combine: and
+max_steps: 10
+
+milestones:
+  - name: 'Notepad is open'
+    check: command
+    run: 'powershell -c "Get-Process notepad -ErrorAction SilentlyContinue | Measure | Select -ExpandProperty Count"'
+    expect: '1'
+    match: exact
+
+  - name: 'Text is typed'
+    check: screenshot
+    description: "Notepad shows 'Hello World' in the text area"
@@ -50,6 +50,11 @@
     BenchmarkTask,
 )
 
+# Avoid circular import — TaskConfig is imported for type checkers only, never at runtime
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from openadapt_evals.task_config import TaskConfig
+
 logger = logging.getLogger(__name__)
 
 
@@ -112,10 +117,12 @@ def __init__(
         adapter: BenchmarkAdapter,
         default_task_id: str | None = None,
         evaluate_every_step: bool = False,
+        task_config: TaskConfig | None = None,
     ):
         self._adapter = adapter
         self._default_task_id = default_task_id
         self._evaluate_every_step = evaluate_every_step
+        self._task_config = task_config
         self._current_task: BenchmarkTask | None = None
         self._step_count = 0
         self._done = False
@@ -160,6 +167,23 @@ def screen_size(self) -> tuple[int, int]:
             return (width, height)
         return (1920, 1200)
 
+    def load_task_config(self, task_config: TaskConfig) -> None:
+        """Set a TaskConfig for dense reward evaluation.
+
+        When set, collect_rollout() and evaluate_dense() use milestone-based
+        partial credit instead of binary evaluation.
+
+        Args:
+            task_config: A TaskConfig loaded from YAML.
+        """
+        self._task_config = task_config
+        self._default_task_id = task_config.id
+        logger.info(
+            "Loaded TaskConfig: %s (%d milestones)",
+            task_config.name,
+            len(task_config.milestones),
+        )
+
     def reset(self, config: ResetConfig | None = None) -> BenchmarkObservation:
         """Reset environment to a task's initial state.
 
@@ -186,8 +210,15 @@ def reset(self, config: ResetConfig | None = None) -> BenchmarkObservation:
                 "Pass task_id in ResetConfig or set default_task_id in constructor."
             )
 
-        # Load and reset the task
-        self._current_task = self._adapter.load_task(task_id)
+        # Load the task — prefer TaskConfig if available (avoids server lookup)
+        if self._task_config and self._task_config.id == task_id:
+            self._current_task = self._task_config.to_benchmark_task()
+        elif hasattr(self._adapter, "load_task_from_json") and self._task_config:
+            self._current_task = self._adapter.load_task_from_json(
+                task_id, self._task_config.to_waa_config()
+            )
+        else:
+            self._current_task = self._adapter.load_task(task_id)
         obs = self._adapter.reset(self._current_task)
 
         # Reset episode state
@@ -429,6 +460,66 @@ def evaluate(self) -> float:
         )
         return result.score
 
+    def evaluate_dense(self) -> float:
+        """Evaluate using dense partial rewards via milestones.
+
+        If a TaskConfig with milestones is set, returns the fraction of
+        milestones passed (0.0 to 1.0). Falls back to binary evaluate()
+        if no TaskConfig or no milestones are defined.
+
+        This gives GRPO gradient signal even when no task fully completes:
+        an agent that passes 3/5 milestones gets reward 0.6 vs 0.0 for
+        one that passes 0/5.
+
+        Returns:
+            Dense reward score between 0.0 and 1.0.
+        """
+        if self._current_task is None:
+            raise RuntimeError("Call reset() before evaluate_dense().")
+
+        # Try milestone evaluation first
+        if self._task_config and self._task_config.milestones:
+            screenshot = b""
+            if self._last_obs and self._last_obs.screenshot:
+                screenshot = self._last_obs.screenshot
+
+            server_url = getattr(
+                getattr(self._adapter, "config", None), "server_url", ""
+            ) or ""
+
+            passed, total = self._task_config.evaluate_milestones(
+                screenshot, server_url
+            )
+            if total > 0:
+                milestone_score = passed / total
+
+                # Also try binary evaluation if available
+                try:
+                    binary_score = self.evaluate()
+                except Exception:
+                    binary_score = 0.0
+
+                # Use the higher of milestone score and binary score
+                # This way, full task completion (1.0) always beats partial (0.6)
+                score = max(milestone_score, binary_score)
+
+                # Backfill reward on last trajectory step
+                if self._trajectory:
+                    self._trajectory[-1].reward = score
+                    self._trajectory[-1].info["milestone_score"] = milestone_score
+                    self._trajectory[-1].info["binary_score"] = binary_score
+                    self._trajectory[-1].info["milestones_passed"] = passed
+                    self._trajectory[-1].info["milestones_total"] = total
+
+                logger.info(
+                    "Dense evaluation: milestones=%d/%d (%.2f), binary=%.2f, final=%.2f",
+                    passed, total, milestone_score, binary_score, score,
+                )
+                return score
+
+        # Fallback to binary evaluation
+        return self.evaluate()
+
     def collect_rollout(
         self,
         agent_fn: Callable[[BenchmarkObservation], BenchmarkAction],
@@ -493,8 +584,11 @@ def collect_rollout(
             if rollout_step.done:
                 break
 
-        # Evaluate and backfill reward
-        score = self.evaluate()
+        # Evaluate and backfill reward — use dense rewards if milestones exist
+        if self._task_config and self._task_config.milestones:
+            score = self.evaluate_dense()
+        else:
+            score = self.evaluate()
 
         logger.info(
             "Rollout complete: %d steps, score=%.2f",