Skip to content

Commit 639a6a2

Browse files
abrichr and claude authored
feat: add TaskVerifierRegistry for custom task verification (#89)
Add a registry pattern for custom task verifiers that can inspect VM state after task execution. This enables GoTo IT Autopilot (and other integrators) to register domain-specific verification functions without subclassing BenchmarkAdapter.

- TaskVerifierRegistry with decorator and programmatic registration
- VerificationResult dataclass with success/score/details
- WAALiveAdapter.run_powershell() for executing PowerShell on the VM
- Built-in clear_browsing_data reference verifier
- 33 tests covering registry operations and built-in verifiers
- Exports from evaluation package and main package __init__

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ca6a936 commit 639a6a2

6 files changed

Lines changed: 754 additions & 0 deletions

File tree

openadapt_evals/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,10 @@
104104
LiveEvaluationTracker,
105105
save_execution_trace,
106106
)
107+
from openadapt_evals.evaluation.verifier_registry import (
108+
TaskVerifierRegistry,
109+
VerificationResult,
110+
)
107111

108112
# Lazy imports for optional dependencies
109113
def __getattr__(name: str):
@@ -140,6 +144,9 @@ def __getattr__(name: str):
140144
"evaluate_agent_on_benchmark",
141145
"compute_metrics",
142146
"compute_domain_metrics",
147+
# Task verification
148+
"TaskVerifierRegistry",
149+
"VerificationResult",
143150
# WAA adapters
144151
"WAAAdapter",
145152
"WAAConfig",

openadapt_evals/adapters/waa/live.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,63 @@ def _recover_failsafe(self) -> bool:
678678
logger.error(f"Fail-safe recovery request failed: {e}")
679679
return False
680680

681+
def run_powershell(self, script: str) -> str:
    """Execute a PowerShell command on the Windows VM and return stdout.

    Sends the script to the WAA server's ``/execute`` endpoint wrapped
    in a ``python -c "..."`` command that invokes PowerShell via
    ``subprocess.run``.

    The script is transported as base64 and decoded on the VM, so it may
    contain single quotes, double quotes, backslashes and newlines without
    breaking the quoting of the outer ``python -c "..."`` command.

    This is primarily intended for use by task verifiers that need to
    inspect VM state (e.g., checking file counts, registry values, etc.).

    Args:
        script: PowerShell command or script to execute. Multi-line
            scripts are supported but should be kept simple.

    Returns:
        The ``stdout`` field of the server's JSON response as a string
        (empty string if the field is absent).

    Raises:
        RuntimeError: If the server responds with a non-200 status or
            the request itself fails (e.g. the server is unreachable).
    """
    import base64

    import requests

    # Build a python -c command that runs PowerShell via subprocess.
    # The /execute endpoint requires the "python -c ..." format.
    #
    # The script is shipped as base64 rather than repr(): repr() switches to
    # double-quote delimiters whenever the script contains a single quote,
    # which would terminate the surrounding "..." of the python -c argument.
    # The base64 alphabet (A-Z, a-z, 0-9, +, /, =) contains no quote or
    # backslash characters, so the generated code has no double quotes.
    payload = base64.b64encode(script.encode("utf-8")).decode("ascii")
    python_code = (
        "import base64, subprocess; "
        f"s = base64.b64decode('{payload}').decode('utf-8'); "
        "r = subprocess.run(['powershell', '-Command', s], "
        "capture_output=True, text=True, timeout=30); "
        "print(r.stdout)"
    )
    # NOTE(review): PowerShell's own stderr (r.stderr) is captured on the VM
    # but never printed, so the "stderr" logged below can only come from the
    # wrapping python process -- confirm whether that is intended.
    command = f'python -c "{python_code}"'

    try:
        resp = requests.post(
            f"{self.config.server_url}/execute",
            json={"command": command},
            timeout=self.config.timeout,
        )
        if resp.status_code != 200:
            # Not a RequestException, so this propagates to the caller as-is.
            raise RuntimeError(
                f"PowerShell execution failed (HTTP {resp.status_code}): "
                f"{resp.text}"
            )

        result = resp.json()
        stderr = result.get("stderr", "")
        if stderr:
            logger.warning("PowerShell stderr: %s", stderr)
        return result.get("stdout", "")
    except requests.RequestException as e:
        raise RuntimeError(
            f"Failed to connect to WAA server for PowerShell execution: {e}"
        ) from e
737+
681738
def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
682739
"""Evaluate current state against task success criteria.
683740

openadapt_evals/evaluation/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
This module provides client-side evaluation without requiring a sidecar service.
44
The evaluators run locally, making HTTP calls to the WAA server's /execute endpoint.
55
6+
It also provides a task verifier registry for registering custom verification
7+
functions that can inspect the VM state after task execution.
8+
69
Example usage:
710
from openadapt_evals.evaluation import EvaluatorClient, discover_vm_ip
811
@@ -16,15 +19,34 @@
1619
# Evaluate a task
1720
result = client.evaluate(task_config)
1821
print(f"Success: {result.success}, Score: {result.score}")
22+
23+
# Custom task verification
24+
from openadapt_evals.evaluation import register, registry, VerificationResult
25+
26+
@register("my_task")
27+
def verify_my_task(adapter):
28+
return VerificationResult(success=True, score=1.0)
29+
30+
result = registry.verify("my_task", adapter)
1931
"""
2032

2133
from .discovery import VMIPDiscovery, DiscoveryMethod, discover_vm_ip
2234
from .client import EvaluatorClient, EvaluationResult
35+
from .verifier_registry import (
36+
TaskVerifierRegistry,
37+
VerificationResult,
38+
register,
39+
registry,
40+
)
2341

2442
__all__ = [
2543
"VMIPDiscovery",
2644
"DiscoveryMethod",
2745
"discover_vm_ip",
2846
"EvaluatorClient",
2947
"EvaluationResult",
48+
"TaskVerifierRegistry",
49+
"VerificationResult",
50+
"register",
51+
"registry",
3052
]
openadapt_evals/evaluation/builtin_verifiers.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""Built-in task verifiers for common verification scenarios.
2+
3+
This module contains reference verifier implementations that demonstrate the
4+
verifier registry pattern. Import this module to register the built-in
5+
verifiers with the global registry.
6+
7+
Example:
8+
# Import to register built-in verifiers
9+
import openadapt_evals.evaluation.builtin_verifiers
10+
11+
from openadapt_evals.evaluation.verifier_registry import registry
12+
result = registry.verify("clear_browsing_data", adapter)
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import logging
18+
19+
from openadapt_evals.evaluation.verifier_registry import (
20+
VerificationResult,
21+
register,
22+
)
23+
24+
logger = logging.getLogger(__name__)
25+
26+
27+
@register("clear_browsing_data")
def verify_clear_browsing_data(adapter) -> VerificationResult:
    """Report whether Chrome's cache directory on the Windows VM is empty.

    Runs a PowerShell one-liner through ``adapter.run_powershell()`` that
    recursively counts the items under Chrome's ``Default\\Cache`` folder,
    then treats a count of zero as success.

    Args:
        adapter: Object exposing ``run_powershell(script) -> str``
            (e.g., WAALiveAdapter).

    Returns:
        VerificationResult with ``success``/``score`` set from the count and
        ``details`` holding the count and the cache path. If anything raises
        (the PowerShell call, or parsing its output as an integer), the
        error is logged and a failing result with ``details["error"]`` is
        returned instead of propagating the exception.
    """
    # PowerShell: count every item below the cache folder; a missing folder
    # is silenced by -ErrorAction SilentlyContinue.
    count_items_script = (
        "(Get-ChildItem -Path "
        "$env:LOCALAPPDATA\\Google\\Chrome\\"
        "'User Data'\\Default\\Cache "
        "-Recurse -ErrorAction SilentlyContinue "
        "| Measure-Object).Count"
    )
    # Human-readable form of the same location, reported back in details.
    reported_path = "%LOCALAPPDATA%\\Google\\Chrome\\" "User Data\\Default\\Cache"

    try:
        raw_output = adapter.run_powershell(count_items_script)
        item_count = int(raw_output.strip())
        cache_is_empty = item_count == 0
        details = {
            "cache_file_count": item_count,
            "cache_path": reported_path,
        }
        return VerificationResult(
            success=cache_is_empty,
            score=float(cache_is_empty),
            details=details,
        )
    except Exception as exc:
        # Best-effort verifier: any failure is reported as "not verified".
        logger.error("Failed to verify clear_browsing_data: %s", exc)
        return VerificationResult(
            success=False,
            score=0.0,
            details={"error": str(exc)},
        )

0 commit comments

Comments
 (0)