-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathtest_openenv_browsergym_basic.py
More file actions
100 lines (85 loc) · 3.65 KB
/
Copy pathtest_openenv_browsergym_basic.py
File metadata and controls
100 lines (85 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import asyncio
import os
import shutil
from typing import Any, Dict, List
import pytest
from eval_protocol.models import EvaluationRow, Message
from eval_protocol.pytest.types import RolloutProcessorConfig
from eval_protocol.pytest.openenv_rollout_processor import OpenEnvRolloutProcessor
# Skip these integration-heavy tests on CI runners by default
pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
@pytest.mark.integration
def test_openenv_browsergym_basic():
"""
Very basic integration test to ensure OpenEnv + BrowserGym can run a single-step rollout.
Skips automatically if Docker is not available.
"""
if shutil.which("docker") is None:
pytest.skip("Docker not available on PATH; skipping OpenEnv BrowserGym basic test.")
# Build a minimal EvaluationRow (messages can be empty; processor will add user prompts)
rows: List[EvaluationRow] = [EvaluationRow(messages=[Message(role="user", content="start")])]
# Use tasks that are known to exist; requires MiniWoB server reachable from containers.
tasks = ["click-test"]
miniwob_url = os.getenv("MINIWOB_URL", "http://172.17.0.1:8888/miniwob/")
browsergym_env_vars: Dict[str, str] = {
"BROWSERGYM_BENCHMARK": "miniwob",
"BROWSERGYM_HEADLESS": "true",
"BROWSERGYM_VIEWPORT_WIDTH": "1280",
"BROWSERGYM_VIEWPORT_HEIGHT": "720",
"BROWSERGYM_TIMEOUT": "10000",
"BROWSERGYM_OBS_AXTREE": "1",
"BROWSERGYM_OBS_PRUNED_HTML": "1",
"BROWSERGYM_RETURN_INFO": "1",
}
if miniwob_url:
browsergym_env_vars["MINIWOB_URL"] = miniwob_url
# Construct the processor with a trivial action_parser; the model output will still be generated
# but we parse to a safe noop action to minimize flakiness for the environment step.
from envs.browsergym_env import BrowserGymAction, BrowserGymEnv # type: ignore
processor = OpenEnvRolloutProcessor(
env_factory=None,
prompt_builder=lambda obs, step, history: "Do nothing",
action_parser=lambda text: BrowserGymAction(action_str="noop()"),
env_client_cls=BrowserGymEnv,
tasks=tasks,
task_var="BROWSERGYM_TASK_NAME",
miniwob_url=miniwob_url,
docker_image="browsergym-env:latest",
benchmark="miniwob",
timeout_ms=10000,
num_generations=1,
env_vars=browsergym_env_vars,
)
# Completion params: rely on an available provider/model in the environment
completion_params: Dict[str, Any] = {
"model": os.getenv(
"OPENENV_TEST_MODEL",
# Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY
"fireworks_ai/accounts/fireworks/models/kimi-k2p5",
),
"temperature": 0.0,
"max_tokens": 16,
"reasoning_effort": "none",
}
# Limit to a single step to keep the test fast and robust
config = RolloutProcessorConfig(
completion_params=completion_params,
semaphore=asyncio.Semaphore(1),
steps=1,
mcp_config_path="",
)
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
async def _run_all():
tasks_ = processor(rows, config)
return await asyncio.gather(*tasks_)
completed_rows = loop.run_until_complete(_run_all())
finally:
loop.close()
assert len(completed_rows) == 1
# Basic sanity checks that a rollout happened and usage is populated
row = completed_rows[0]
assert row is not None
assert row.execution_metadata is not None
assert getattr(row.execution_metadata, "duration_seconds", 0.0) >= 0.0