Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions openadapt_evals/adapters/rl_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
at the final step. This matches GRPO where reward comes from an outcome
verifier, not per-step shaping.

When ``evaluate_every_step=True``, the evaluator is called after each
step and the score is included in ``info["evaluation_score"]``. The
reward signal is NOT changed — training code decides how to use the
per-step evaluation data.

Example:
from openadapt_evals.adapters.waa.live import WAALiveAdapter, WAALiveConfig
from openadapt_evals.adapters.rl_env import RLEnvironment
Expand All @@ -23,6 +28,12 @@
if obs_step.done:
break
score = env.evaluate()

# With per-step evaluation (for RL training loops):
env = RLEnvironment(adapter, default_task_id="<WAA_UUID>", evaluate_every_step=True)
obs = env.reset()
step = env.step(BenchmarkAction(type="click", x=0.5, y=0.3))
print(step.info["evaluation_score"]) # 0.0 or 1.0
"""

from __future__ import annotations
Expand Down Expand Up @@ -100,9 +111,11 @@ def __init__(
self,
adapter: BenchmarkAdapter,
default_task_id: str | None = None,
evaluate_every_step: bool = False,
):
self._adapter = adapter
self._default_task_id = default_task_id
self._evaluate_every_step = evaluate_every_step
self._current_task: BenchmarkTask | None = None
self._step_count = 0
self._done = False
Expand Down Expand Up @@ -224,6 +237,16 @@ def step(self, action: BenchmarkAction) -> RolloutStep:
self._done = done
info["step"] = self._step_count

# Optional per-step evaluation for RL training loops
if self._evaluate_every_step and self._current_task is not None:
try:
result = self._adapter.evaluate(self._current_task)
info["evaluation_score"] = result.score
info["evaluation_success"] = result.success
except Exception as e:
logger.warning("Per-step evaluation failed at step %d: %s", self._step_count, e)
info["evaluation_error"] = str(e)

rollout_step = RolloutStep(
observation=obs,
action=action,
Expand Down
12 changes: 6 additions & 6 deletions openadapt_evals/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- SmartMockAgent: Designed to pass mock adapter tests
- ApiAgent: Uses Claude/GPT APIs directly (for WAA)
- ClaudeComputerUseAgent: Uses Claude's native computer_use tool
- HttpAgent: Delegates to a remote HTTP agent endpoint
- Qwen3VLAgent: Uses Qwen3-VL for local inference
- SmolOperatorAgent: Uses SmolVLM2-2.2B for local inference
- PolicyAgent: Uses local trained policy model
Expand All @@ -18,16 +19,13 @@

Example:
```python
from openadapt_evals.agents import ApiAgent, ScriptedAgent, RetrievalAugmentedAgent
from openadapt_evals.agents import ApiAgent, ScriptedAgent, HttpAgent

# Use API agent with Claude
agent = ApiAgent(provider="anthropic")

# Use retrieval-augmented agent with automatic demo selection
agent = RetrievalAugmentedAgent(
demo_library_path="/path/to/demo_library",
provider="anthropic",
)
# Use remote agent-as-a-service
agent = HttpAgent(endpoint_url="http://gpu-box:8080")

# Use scripted agent for replay
agent = ScriptedAgent([
Expand Down Expand Up @@ -58,6 +56,7 @@
)
from openadapt_evals.agents.api_agent import ApiAgent
from openadapt_evals.agents.claude_computer_use_agent import ClaudeComputerUseAgent
from openadapt_evals.agents.http_agent import HttpAgent
from openadapt_evals.agents.retrieval_agent import RetrievalAugmentedAgent

# Lazy imports for agents requiring additional dependencies
Expand Down Expand Up @@ -86,6 +85,7 @@ def __getattr__(name: str):
"SmartMockAgent",
"ApiAgent",
"ClaudeComputerUseAgent",
"HttpAgent",
"Qwen3VLAgent",
"SmolOperatorAgent",
"PolicyAgent",
Expand Down
199 changes: 199 additions & 0 deletions openadapt_evals/agents/http_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
"""HTTP-backed agent for remote agent-as-a-service integration.

Forwards observations to any HTTP endpoint and parses the response
into a BenchmarkAction. This lets external teams deploy their own
agent stack (model + prompt + parsing) as a black-box HTTP server
without coupling to openadapt-evals internals.

Protocol:
POST {endpoint_url}/act
Request:
{
"screenshot_b64": "<base64 PNG>",
"instruction": "Click the Submit button",
"task_id": "notepad_1",
"viewport": [1920, 1200],
"accessibility_tree": {...},
"step_count": 3
}
Response:
{
"type": "click",
"x": 0.5,
"y": 0.3
}

GET {endpoint_url}/health -> 200 OK
POST {endpoint_url}/reset -> optional; sent by reset(), request failures are ignored

Example:
from openadapt_evals.agents import HttpAgent

agent = HttpAgent(endpoint_url="http://gpu-box:8080")
action = agent.act(observation, task)
"""

from __future__ import annotations

import base64
import logging
from typing import Any

import requests

from openadapt_evals.adapters.base import (
BenchmarkAction,
BenchmarkObservation,
BenchmarkTask,
)
from openadapt_evals.agents.base import BenchmarkAgent

logger = logging.getLogger(__name__)


class HttpAgent(BenchmarkAgent):
    """Agent that delegates action selection to a remote HTTP endpoint.

    The remote server receives observation data (screenshot, task,
    accessibility tree) and returns an action dict that maps directly
    to BenchmarkAction fields.

    Any failure while talking to the endpoint (transport error, non-2xx
    status, body that is not a JSON object) is logged and reported as a
    ``done`` action whose ``raw_action["error"]`` describes the problem,
    so a broken endpoint ends the episode instead of raising.

    Args:
        endpoint_url: Base URL of the remote agent server. Trailing
            slashes are stripped.
        timeout: Timeout in seconds for the ``/act`` request.
        headers: Optional extra HTTP headers (e.g. auth tokens).
    """

    def __init__(
        self,
        endpoint_url: str,
        timeout: int = 120,
        headers: dict[str, str] | None = None,
    ):
        self.endpoint_url = endpoint_url.rstrip("/")
        self.timeout = timeout
        self.headers = headers or {}
        # Number of act() calls since construction or the last reset().
        self._step_count = 0

        logger.info("HttpAgent initialized: endpoint=%s", self.endpoint_url)

    @staticmethod
    def _error_action(kind: str, log_message: str, detail: object) -> BenchmarkAction:
        """Log a failure and return the ``done`` action that reports it."""
        logger.error("%s: %s", log_message, detail)
        return BenchmarkAction(
            type="done",
            raw_action={"error": f"{kind}: {detail}"},
        )

    def act(
        self,
        observation: BenchmarkObservation,
        task: BenchmarkTask,
        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
    ) -> BenchmarkAction:
        """Send observation to remote endpoint, parse response as BenchmarkAction.

        Args:
            observation: Current observation; its screenshot (if any) is
                base64-encoded into the request payload.
            task: Task whose ``instruction`` and ``task_id`` are forwarded.
            history: Accepted for interface compatibility; not sent to the
                remote endpoint.

        Returns:
            The action parsed from the JSON response, or a ``done`` action
            carrying ``raw_action["error"]`` when the request fails or the
            response is not a JSON object.
        """
        self._step_count += 1

        # Encode screenshot
        screenshot_b64 = None
        if observation.screenshot:
            screenshot_b64 = base64.b64encode(observation.screenshot).decode("ascii")

        payload: dict[str, Any] = {
            "screenshot_b64": screenshot_b64,
            "instruction": task.instruction,
            "task_id": task.task_id,
            "viewport": list(observation.viewport) if observation.viewport else None,
            "accessibility_tree": observation.accessibility_tree,
            # Zero-based index of this step (the counter was already bumped).
            "step_count": self._step_count - 1,
        }

        try:
            resp = requests.post(
                f"{self.endpoint_url}/act",
                json=payload,
                headers=self.headers,
                timeout=self.timeout,
            )
            resp.raise_for_status()
            data = resp.json()
        except requests.ConnectionError as e:
            return self._error_action("connection_failed", "Connection failed", e)
        except requests.Timeout as e:
            return self._error_action("timeout", "Request timed out", e)
        except requests.HTTPError as e:
            return self._error_action("http_error", "HTTP error", e)
        except (ValueError, KeyError) as e:
            # Must stay ahead of the generic clause below: JSON decode errors
            # are ValueErrors and keep their "invalid_response" label.
            return self._error_action("invalid_response", "Invalid response", e)
        except requests.RequestException as e:
            # Remaining transport failures (redirect loops, broken chunked
            # encoding, ...) previously escaped act() and aborted the run.
            return self._error_action("request_failed", "Request failed", e)

        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, string, number, null)
            # cannot be mapped onto BenchmarkAction fields.
            return self._error_action(
                "invalid_response",
                "Invalid response",
                f"expected a JSON object, got {type(data).__name__}",
            )

        return _parse_action_response(data)

    def reset(self) -> None:
        """Reset agent state and optionally notify remote endpoint."""
        self._step_count = 0
        try:
            requests.post(
                f"{self.endpoint_url}/reset",
                headers=self.headers,
                timeout=10,
            )
        except requests.RequestException as e:
            # Remote endpoint may not support /reset — that's fine, but
            # leave a trace for anyone debugging a stateful server.
            logger.debug("Remote /reset request failed (ignored): %s", e)

    def health_check(self) -> bool:
        """Check if the remote endpoint is reachable.

        Returns:
            True if GET /health returns 200, False otherwise.
        """
        try:
            resp = requests.get(
                f"{self.endpoint_url}/health",
                headers=self.headers,
                timeout=10,
            )
        except requests.RequestException:
            return False
        return resp.status_code == 200


def _parse_action_response(data: dict[str, Any]) -> BenchmarkAction:
    """Build a BenchmarkAction from the remote agent's response dict.

    ``type`` falls back to ``"done"`` when the key is absent. A truthy
    ``target_bbox`` is converted to a tuple; every other recognised key
    is copied through unchanged (``None`` when missing). The complete
    response dict is kept on the action as ``raw_action``.

    Args:
        data: Response dict from the remote agent.

    Returns:
        Parsed BenchmarkAction.
    """
    # Keys whose values are handed to BenchmarkAction without conversion.
    passthrough_fields = (
        "x",
        "y",
        "target_node_id",
        "target_role",
        "target_name",
        "text",
        "key",
        "modifiers",
        "scroll_direction",
        "scroll_amount",
        "end_x",
        "end_y",
        "answer",
    )
    copied = {field: data.get(field) for field in passthrough_fields}

    bbox = None
    if data.get("target_bbox"):
        bbox = tuple(data["target_bbox"])

    return BenchmarkAction(
        type=data.get("type", "done"),
        target_bbox=bbox,
        raw_action=data,
        **copied,
    )
Loading
Loading