Skip to content

Commit 95ebea5

Browse files
abrichr and claude committed
feat: add HttpAgent, per-step evaluation, and lightweight trace export
Three platform infrastructure features:

1. HttpAgent (agents/http_agent.py): Generic agent-as-HTTP-service that forwards observations to any remote endpoint and parses BenchmarkAction responses. Enables teams to deploy custom agent stacks (model + prompt + parsing) as black-box HTTP servers, cleanly solving GPU/CPU separation.

2. Per-step evaluation in RLEnvironment: New evaluate_every_step parameter calls the WAA evaluator after each step and populates info["evaluation_score"]. Does NOT change the reward signal — training code decides how to use it. Useful for online RL training loops.

3. LightweightTraceExporter: Plain JSON + screenshots trace export with no openadapt-ml dependency. Produces episode JSON, manifest, and JSONL training samples in a universal format.

All 34 new tests pass. 984 existing tests unaffected.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4a28653 commit 95ebea5

7 files changed

Lines changed: 1175 additions & 24 deletions

File tree

openadapt_evals/adapters/rl_env.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
at the final step. This matches GRPO where reward comes from an outcome
1111
verifier, not per-step shaping.
1212
13+
When ``evaluate_every_step=True``, the evaluator is called after each
14+
step and the score is included in ``info["evaluation_score"]``. The
15+
reward signal is NOT changed — training code decides how to use the
16+
per-step evaluation data.
17+
1318
Example:
1419
from openadapt_evals.adapters.waa.live import WAALiveAdapter, WAALiveConfig
1520
from openadapt_evals.adapters.rl_env import RLEnvironment
@@ -23,6 +28,12 @@
2328
if obs_step.done:
2429
break
2530
score = env.evaluate()
31+
32+
# With per-step evaluation (for RL training loops):
33+
env = RLEnvironment(adapter, default_task_id="<WAA_UUID>", evaluate_every_step=True)
34+
obs = env.reset()
35+
step = env.step(BenchmarkAction(type="click", x=0.5, y=0.3))
36+
print(step.info["evaluation_score"]) # 0.0 or 1.0
2637
"""
2738

2839
from __future__ import annotations
@@ -100,9 +111,11 @@ def __init__(
100111
self,
101112
adapter: BenchmarkAdapter,
102113
default_task_id: str | None = None,
114+
evaluate_every_step: bool = False,
103115
):
104116
self._adapter = adapter
105117
self._default_task_id = default_task_id
118+
self._evaluate_every_step = evaluate_every_step
106119
self._current_task: BenchmarkTask | None = None
107120
self._step_count = 0
108121
self._done = False
@@ -224,6 +237,16 @@ def step(self, action: BenchmarkAction) -> RolloutStep:
224237
self._done = done
225238
info["step"] = self._step_count
226239

240+
# Optional per-step evaluation for RL training loops
241+
if self._evaluate_every_step and self._current_task is not None:
242+
try:
243+
result = self._adapter.evaluate(self._current_task)
244+
info["evaluation_score"] = result.score
245+
info["evaluation_success"] = result.success
246+
except Exception as e:
247+
logger.warning("Per-step evaluation failed at step %d: %s", self._step_count, e)
248+
info["evaluation_error"] = str(e)
249+
227250
rollout_step = RolloutStep(
228251
observation=obs,
229252
action=action,

openadapt_evals/agents/__init__.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
- SmartMockAgent: Designed to pass mock adapter tests
1111
- ApiAgent: Uses Claude/GPT APIs directly (for WAA)
1212
- ClaudeComputerUseAgent: Uses Claude's native computer_use tool
13+
- HttpAgent: Delegates to a remote HTTP agent endpoint
1314
- Qwen3VLAgent: Uses Qwen3-VL for local inference
1415
- SmolOperatorAgent: Uses SmolVLM2-2.2B for local inference
1516
- PolicyAgent: Uses local trained policy model
@@ -18,16 +19,13 @@
1819
1920
Example:
2021
```python
21-
from openadapt_evals.agents import ApiAgent, ScriptedAgent, RetrievalAugmentedAgent
22+
from openadapt_evals.agents import ApiAgent, ScriptedAgent, HttpAgent
2223
2324
# Use API agent with Claude
2425
agent = ApiAgent(provider="anthropic")
2526
26-
# Use retrieval-augmented agent with automatic demo selection
27-
agent = RetrievalAugmentedAgent(
28-
demo_library_path="/path/to/demo_library",
29-
provider="anthropic",
30-
)
27+
# Use remote agent-as-a-service
28+
agent = HttpAgent(endpoint_url="http://gpu-box:8080")
3129
3230
# Use scripted agent for replay
3331
agent = ScriptedAgent([
@@ -58,6 +56,7 @@
5856
)
5957
from openadapt_evals.agents.api_agent import ApiAgent
6058
from openadapt_evals.agents.claude_computer_use_agent import ClaudeComputerUseAgent
59+
from openadapt_evals.agents.http_agent import HttpAgent
6160
from openadapt_evals.agents.retrieval_agent import RetrievalAugmentedAgent
6261

6362
# Lazy imports for agents requiring additional dependencies
@@ -86,6 +85,7 @@ def __getattr__(name: str):
8685
"SmartMockAgent",
8786
"ApiAgent",
8887
"ClaudeComputerUseAgent",
88+
"HttpAgent",
8989
"Qwen3VLAgent",
9090
"SmolOperatorAgent",
9191
"PolicyAgent",
openadapt_evals/agents/http_agent.py

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
"""HTTP-backed agent for remote agent-as-a-service integration.
2+
3+
Forwards observations to any HTTP endpoint and parses the response
4+
into a BenchmarkAction. This lets external teams deploy their own
5+
agent stack (model + prompt + parsing) as a black-box HTTP server
6+
without coupling to openadapt-evals internals.
7+
8+
Protocol:
9+
POST {endpoint_url}/act
10+
Request:
11+
{
12+
"screenshot_b64": "<base64 PNG>",
13+
"instruction": "Click the Submit button",
14+
"task_id": "notepad_1",
15+
"viewport": [1920, 1200],
16+
"accessibility_tree": {...},
17+
"step_count": 3
18+
}
19+
Response:
20+
{
21+
"type": "click",
22+
"x": 0.5,
23+
"y": 0.3
24+
}
25+
26+
GET {endpoint_url}/health -> 200 OK
27+
28+
Example:
29+
from openadapt_evals.agents import HttpAgent
30+
31+
agent = HttpAgent(endpoint_url="http://gpu-box:8080")
32+
action = agent.act(observation, task)
33+
"""
34+
35+
from __future__ import annotations
36+
37+
import base64
38+
import logging
39+
from typing import Any
40+
41+
import requests
42+
43+
from openadapt_evals.adapters.base import (
44+
BenchmarkAction,
45+
BenchmarkObservation,
46+
BenchmarkTask,
47+
)
48+
from openadapt_evals.agents.base import BenchmarkAgent
49+
50+
logger = logging.getLogger(__name__)
51+
52+
53+
class HttpAgent(BenchmarkAgent):
    """Agent that delegates to a remote HTTP endpoint.

    The remote server receives observation data (screenshot, task,
    accessibility tree) and returns an action dict that maps directly
    to BenchmarkAction fields.

    Any failure to obtain a usable action (transport error, non-2xx
    status, unparseable or malformed body) is logged and converted into
    a ``done`` action whose ``raw_action["error"]`` describes the problem,
    so a misbehaving endpoint ends the episode instead of raising.

    Args:
        endpoint_url: Base URL of the remote agent server. Trailing
            slashes are stripped.
        timeout: Request timeout in seconds for ``/act`` calls.
        headers: Optional extra HTTP headers (e.g. auth tokens).
    """

    def __init__(
        self,
        endpoint_url: str,
        timeout: int = 120,
        headers: dict[str, str] | None = None,
    ):
        self.endpoint_url = endpoint_url.rstrip("/")
        self.timeout = timeout
        self.headers = headers or {}
        self._step_count = 0

        logger.info("HttpAgent initialized: endpoint=%s", self.endpoint_url)

    @staticmethod
    def _error_action(kind: str, detail: object) -> BenchmarkAction:
        """Build the terminal ``done`` action used to report a failure."""
        return BenchmarkAction(
            type="done",
            raw_action={"error": f"{kind}: {detail}"},
        )

    def act(
        self,
        observation: BenchmarkObservation,
        task: BenchmarkTask,
        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
    ) -> BenchmarkAction:
        """Send observation to remote endpoint, parse response as BenchmarkAction.

        Args:
            observation: Current observation; its screenshot (if any) is
                base64-encoded into the request payload.
            task: Task whose instruction and id are forwarded.
            history: Accepted for interface compatibility; not sent to
                the remote endpoint.

        Returns:
            The action parsed from the JSON response, or a ``done`` action
            carrying an ``error`` entry in ``raw_action`` on any failure.
        """
        # The payload carries the zero-based index of this step; the counter
        # advances even when the request fails.
        step_index = self._step_count
        self._step_count += 1

        # Encode screenshot
        screenshot_b64 = None
        if observation.screenshot:
            screenshot_b64 = base64.b64encode(observation.screenshot).decode("ascii")

        payload: dict[str, Any] = {
            "screenshot_b64": screenshot_b64,
            "instruction": task.instruction,
            "task_id": task.task_id,
            "viewport": list(observation.viewport) if observation.viewport else None,
            "accessibility_tree": observation.accessibility_tree,
            "step_count": step_index,
        }

        try:
            resp = requests.post(
                f"{self.endpoint_url}/act",
                json=payload,
                headers=self.headers,
                timeout=self.timeout,
            )
            resp.raise_for_status()
            data = resp.json()
        except requests.ConnectionError as e:
            logger.error("Connection failed: %s", e)
            return self._error_action("connection_failed", e)
        except requests.Timeout as e:
            logger.error("Request timed out: %s", e)
            return self._error_action("timeout", e)
        except requests.HTTPError as e:
            logger.error("HTTP error: %s", e)
            return self._error_action("http_error", e)
        except (ValueError, KeyError) as e:
            # Must precede the RequestException catch-all: newer versions of
            # requests raise a JSON decode error that is both a ValueError
            # and a RequestException, and it should stay "invalid_response".
            logger.error("Invalid response: %s", e)
            return self._error_action("invalid_response", e)
        except requests.RequestException as e:
            # Remaining transport failures (redirect loops, broken chunked
            # encoding, invalid URL, ...) must not crash the caller's loop.
            logger.error("Request failed: %s", e)
            return self._error_action("request_failed", e)

        # A syntactically valid JSON body may still be a list/string/null.
        if not isinstance(data, dict):
            detail = f"expected JSON object, got {type(data).__name__}"
            logger.error("Invalid response: %s", detail)
            return self._error_action("invalid_response", detail)

        try:
            return _parse_action_response(data)
        except (TypeError, ValueError) as e:
            # e.g. a non-iterable "target_bbox" value.
            logger.error("Invalid response: %s", e)
            return self._error_action("invalid_response", e)

    def reset(self) -> None:
        """Reset agent state and optionally notify remote endpoint."""
        self._step_count = 0
        try:
            requests.post(
                f"{self.endpoint_url}/reset",
                headers=self.headers,
                timeout=10,
            )
        except requests.RequestException as e:
            # Remote endpoint may not support /reset — that's fine; keep
            # this best-effort but leave a trace for debugging.
            logger.debug("Remote /reset notification failed: %s", e)

    def health_check(self) -> bool:
        """Check if the remote endpoint is reachable.

        Returns:
            True if GET /health returns 200, False otherwise.
        """
        try:
            resp = requests.get(
                f"{self.endpoint_url}/health",
                headers=self.headers,
                timeout=10,
            )
            return resp.status_code == 200
        except requests.RequestException:
            return False
166+
167+
168+
def _parse_action_response(data: dict[str, Any]) -> BenchmarkAction:
    """Convert a response dict into a BenchmarkAction.

    The response dict should have at minimum a ``type`` field. All other
    fields map directly to BenchmarkAction attributes; absent fields are
    passed as ``None``.

    Args:
        data: Response dict from the remote agent.

    Returns:
        Parsed BenchmarkAction. A missing, null, or empty ``type`` falls
        back to ``"done"``. If ``data`` is not a dict, a ``done`` action
        with an ``invalid_response`` error in ``raw_action`` is returned
        instead of raising.
    """
    # The remote server controls the payload; a JSON list/string/null must
    # not surface as an AttributeError in the caller.
    if not isinstance(data, dict):
        return BenchmarkAction(
            type="done",
            raw_action={
                "error": (
                    "invalid_response: expected JSON object, "
                    f"got {type(data).__name__}"
                )
            },
        )

    # Only a non-empty JSON array is a usable bounding box; anything else
    # (absent, null, empty, scalar) is treated as "no bbox".
    bbox = data.get("target_bbox")
    target_bbox = tuple(bbox) if isinstance(bbox, (list, tuple)) and bbox else None

    return BenchmarkAction(
        # `or` also covers an explicit `"type": null`, which .get()'s
        # default would not.
        type=data.get("type") or "done",
        x=data.get("x"),
        y=data.get("y"),
        target_node_id=data.get("target_node_id"),
        target_bbox=target_bbox,
        target_role=data.get("target_role"),
        target_name=data.get("target_name"),
        text=data.get("text"),
        key=data.get("key"),
        modifiers=data.get("modifiers"),
        scroll_direction=data.get("scroll_direction"),
        scroll_amount=data.get("scroll_amount"),
        end_x=data.get("end_x"),
        end_y=data.get("end_y"),
        answer=data.get("answer"),
        raw_action=data,
    )

0 commit comments

Comments
 (0)