Skip to content

Commit 0be63f1

Browse files
committed
feat(tools): add browser_wait_for
1 parent 1ebf591 commit 0be63f1

6 files changed

Lines changed: 154 additions & 10 deletions

File tree

crates/rexos-tools/src/browser_bridge.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,41 @@ def main() -> int:
116116
},
117117
}
118118
)
119+
elif action == "WaitFor":
120+
selector = cmd.get("selector", "")
121+
text = cmd.get("text", "")
122+
per_call_timeout = cmd.get("timeout_ms")
123+
try:
124+
per_call_timeout = int(per_call_timeout) if per_call_timeout else timeout_ms
125+
except Exception:
126+
per_call_timeout = timeout_ms
127+
128+
waited_for = {}
129+
if selector:
130+
page.wait_for_selector(selector, timeout=per_call_timeout)
131+
waited_for["selector"] = selector
132+
if text:
133+
page.get_by_text(text, exact=False).first.wait_for(
134+
state="visible", timeout=per_call_timeout
135+
)
136+
waited_for["text"] = text
137+
138+
if not waited_for:
139+
respond({"success": False, "error": "missing selector/text"})
140+
continue
141+
142+
current_url = page.url if page.url else current_url
143+
respond(
144+
{
145+
"success": True,
146+
"data": {
147+
"waited_for": waited_for,
148+
"timeout_ms": per_call_timeout,
149+
"title": page.title(),
150+
"url": current_url,
151+
},
152+
}
153+
)
119154
elif action == "ReadPage":
120155
content = "(empty)"
121156
try:

crates/rexos-tools/src/lib.rs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ impl Toolset {
9090
browser_click_def(),
9191
browser_type_def(),
9292
browser_press_key_def(),
93+
browser_wait_for_def(),
9394
browser_read_page_def(),
9495
browser_screenshot_def(),
9596
browser_close_def(),
@@ -270,6 +271,16 @@ impl Toolset {
270271
self.browser_press_key(args.selector.as_deref(), &args.key)
271272
.await
272273
}
274+
"browser_wait_for" => {
275+
let args: BrowserWaitForArgs = serde_json::from_str(arguments_json)
276+
.context("parse browser_wait_for arguments")?;
277+
self.browser_wait_for(
278+
args.selector.as_deref(),
279+
args.text.as_deref(),
280+
args.timeout_ms,
281+
)
282+
.await
283+
}
273284
"browser_read_page" => {
274285
let _args: serde_json::Value = serde_json::from_str(arguments_json)
275286
.context("parse browser_read_page arguments")?;
@@ -1314,6 +1325,39 @@ impl Toolset {
13141325
Ok(resp.into_tool_output()?)
13151326
}
13161327

1328+
async fn browser_wait_for(
1329+
&self,
1330+
selector: Option<&str>,
1331+
text: Option<&str>,
1332+
timeout_ms: Option<u64>,
1333+
) -> anyhow::Result<String> {
1334+
if selector.unwrap_or("").trim().is_empty() && text.unwrap_or("").trim().is_empty() {
1335+
bail!("browser_wait_for requires selector or text");
1336+
}
1337+
1338+
let mut guard = self.browser.lock().await;
1339+
let session = guard
1340+
.as_mut()
1341+
.context("browser session not started; call browser_navigate first")?;
1342+
1343+
let mut cmd = serde_json::json!({
1344+
"action": "WaitFor",
1345+
});
1346+
1347+
if let Some(selector) = selector {
1348+
cmd["selector"] = serde_json::Value::String(selector.to_string());
1349+
}
1350+
if let Some(text) = text {
1351+
cmd["text"] = serde_json::Value::String(text.to_string());
1352+
}
1353+
if let Some(timeout_ms) = timeout_ms {
1354+
cmd["timeout_ms"] = serde_json::Value::Number(timeout_ms.into());
1355+
}
1356+
1357+
let resp = session.send(cmd).await?;
1358+
Ok(resp.into_tool_output()?)
1359+
}
1360+
13171361
async fn browser_read_page(&self) -> anyhow::Result<String> {
13181362
let mut guard = self.browser.lock().await;
13191363
let session = guard
@@ -1769,6 +1813,16 @@ struct BrowserPressKeyArgs {
17691813
selector: Option<String>,
17701814
}
17711815

1816+
#[derive(Debug, serde::Deserialize)]
1817+
struct BrowserWaitForArgs {
1818+
#[serde(default)]
1819+
selector: Option<String>,
1820+
#[serde(default)]
1821+
text: Option<String>,
1822+
#[serde(default)]
1823+
timeout_ms: Option<u64>,
1824+
}
1825+
17721826
#[derive(Debug, serde::Deserialize)]
17731827
struct BrowserScreenshotArgs {
17741828
#[serde(default)]
@@ -3236,6 +3290,25 @@ fn browser_press_key_def() -> ToolDefinition {
32363290
}
32373291
}
32383292

3293+
fn browser_wait_for_def() -> ToolDefinition {
3294+
ToolDefinition {
3295+
kind: "function".to_string(),
3296+
function: ToolFunctionDefinition {
3297+
name: "browser_wait_for".to_string(),
3298+
description: "Wait for a selector or text to appear on the page.".to_string(),
3299+
parameters: serde_json::json!({
3300+
"type": "object",
3301+
"properties": {
3302+
"selector": { "type": "string", "description": "Optional CSS selector to wait for." },
3303+
"text": { "type": "string", "description": "Optional visible text to wait for." },
3304+
"timeout_ms": { "type": "integer", "description": "Optional timeout in milliseconds.", "minimum": 1 }
3305+
},
3306+
"additionalProperties": false
3307+
}),
3308+
},
3309+
}
3310+
}
3311+
32393312
fn browser_read_page_def() -> ToolDefinition {
32403313
ToolDefinition {
32413314
kind: "function".to_string(),
@@ -3385,6 +3458,7 @@ mod tests {
33853458
"browser_click",
33863459
"browser_type",
33873460
"browser_press_key",
3461+
"browser_wait_for",
33883462
"browser_read_page",
33893463
"browser_screenshot",
33903464
"browser_close",
@@ -4074,6 +4148,22 @@ mod tests {
40744148
);
40754149
}
40764150

4151+
#[tokio::test]
4152+
async fn browser_wait_for_requires_session() {
4153+
let tmp = tempfile::tempdir().unwrap();
4154+
let tools = Toolset::new(tmp.path().to_path_buf()).unwrap();
4155+
4156+
let err = tools
4157+
.call("browser_wait_for", r#"{ "text": "hello" }"#)
4158+
.await
4159+
.unwrap_err();
4160+
let msg = err.to_string();
4161+
assert!(
4162+
msg.contains("browser_navigate") || msg.contains("session"),
4163+
"{msg}"
4164+
);
4165+
}
4166+
40774167
#[tokio::test]
40784168
async fn browser_read_page_requires_session() {
40794169
let tmp = tempfile::tempdir().unwrap();
@@ -4135,6 +4225,13 @@ mod tests {
41354225
let v: serde_json::Value = serde_json::from_str(&out).unwrap();
41364226
assert_eq!(v["key"], "Enter");
41374227

4228+
let out = tools
4229+
.call("browser_wait_for", r#"{ "text": "hello", "timeout_ms": 1 }"#)
4230+
.await
4231+
.unwrap();
4232+
let v: serde_json::Value = serde_json::from_str(&out).unwrap();
4233+
assert_eq!(v["waited_for"]["text"], "hello");
4234+
41384235
let page = tools.call("browser_read_page", r#"{}"#).await.unwrap();
41394236
let v: serde_json::Value = serde_json::from_str(&page).unwrap();
41404237
assert_eq!(v["title"], "Stub");
@@ -4215,6 +4312,13 @@ for line in sys.stdin:
42154312
resp = {"success": True, "data": {"typed": cmd.get("text", ""), "selector": cmd.get("selector", "")}}
42164313
elif action == "PressKey":
42174314
resp = {"success": True, "data": {"key": cmd.get("key", ""), "selector": cmd.get("selector", "")}}
4315+
elif action == "WaitFor":
4316+
waited_for = {}
4317+
if cmd.get("selector"):
4318+
waited_for["selector"] = cmd.get("selector", "")
4319+
if cmd.get("text"):
4320+
waited_for["text"] = cmd.get("text", "")
4321+
resp = {"success": True, "data": {"waited_for": waited_for, "timeout_ms": cmd.get("timeout_ms")}}
42184322
elif action == "Close":
42194323
resp = {"success": True, "data": {"status": "closed"}}
42204324
sys.stdout.write(json.dumps(resp) + "\n")

docs-site/how-to/browser-automation.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ If your Python executable isn't `python3`, set `REXOS_BROWSER_PYTHON` (example:
1919
- `browser_click` — click by CSS selector (best-effort text fallback)
2020
- `browser_type` — fill an input
2121
- `browser_press_key` — press a key (example: `Enter` to submit a form)
22+
- `browser_wait_for` — wait for a selector/text to appear
2223
- `browser_read_page` — return `{title,url,content}` (content is truncated)
2324
- `browser_screenshot` — write a PNG to a workspace-relative path
2425
- `browser_close` — close the session (idempotent)
@@ -29,8 +30,9 @@ If your Python executable isn't `python3`, set `REXOS_BROWSER_PYTHON` (example:
2930
2. `browser_read_page` to confirm state
3031
3. One small action: `browser_click` or `browser_type`
3132
- If you need to submit a form, use `browser_press_key` with `Enter`.
32-
4. `browser_read_page` again to confirm the page changed
33-
5. Repeat until done, then `browser_screenshot` for evidence and `browser_close`
33+
4. If the page updates asynchronously, use `browser_wait_for` (selector/text) to wait for the new state
34+
5. `browser_read_page` again to confirm the page changed
35+
6. Repeat until done, then `browser_screenshot` for evidence and `browser_close`
3436

3537
## Selector tips
3638

@@ -48,10 +50,11 @@ If a CSS selector fails, `browser_click` will try a **best-effort visible-text f
4850
Use this as a starting point for agent prompts:
4951

5052
```text
51-
You may use RexOS browser tools (browser_navigate/click/type/press_key/read_page/screenshot/close).
53+
You may use RexOS browser tools (browser_navigate/click/type/press_key/wait_for/read_page/screenshot/close).
5254
5355
Rules:
54-
- Always call browser_read_page after navigate/click/type to verify page state before the next step.
56+
- Always call browser_read_page after navigate/click/type/press_key to verify page state before the next step.
57+
- If the page updates async, use browser_wait_for (selector/text) before browser_read_page.
5558
- Keep actions minimal and reversible. If selectors fail, read the page and adjust selectors.
5659
- Save a screenshot at the end to .rexos/browser/<topic>.png.
5760
- Do NOT enter credentials or complete purchases without explicit user confirmation.

docs-site/reference/tools.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ By default it rejects loopback/private IPs (basic SSRF protection). For local te
3535

3636
Browser tools enable headless browser automation via a Python Playwright bridge:
3737

38-
- `browser_navigate` / `browser_click` / `browser_type` / `browser_press_key` / `browser_read_page` / `browser_screenshot` / `browser_close`
38+
- `browser_navigate` / `browser_click` / `browser_type` / `browser_press_key` / `browser_wait_for` / `browser_read_page` / `browser_screenshot` / `browser_close`
3939

4040
Notes:
4141

docs-site/zh/how-to/browser-automation.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ python3 -m playwright install chromium
1919
- `browser_click`:按 CSS selector 点击(会做尽力的可见文本 fallback)
2020
- `browser_type`:填写输入框
2121
- `browser_press_key`:按键(例如用 `Enter` 提交表单)
22+
- `browser_wait_for`:等待 selector/text 出现
2223
- `browser_read_page`:返回 `{title,url,content}`(content 会被截断)
2324
- `browser_screenshot`:把 PNG 写入 workspace 相对路径
2425
- `browser_close`:关闭 session(可重复调用)
@@ -29,8 +30,9 @@ python3 -m playwright install chromium
2930
2. `browser_read_page` 确认状态
3031
3. 每次只做一个小动作:`browser_click` 或 `browser_type`
3132
- 需要提交表单时,用 `browser_press_key` 按 `Enter`
32-
4. 再次 `browser_read_page` 确认页面确实变化
33-
5. 直到完成,最后 `browser_screenshot` 留证并 `browser_close`
33+
4. 如果页面是异步更新,用 `browser_wait_for`(selector/text)等待新状态出现
34+
5. 再次 `browser_read_page` 确认页面确实变化
35+
6. 直到完成,最后 `browser_screenshot` 留证并 `browser_close`
3436

3537
## Selector 小技巧
3638

@@ -46,10 +48,10 @@ python3 -m playwright install chromium
4648
## Prompt 模板(可直接复制)
4749

4850
```text
49-
你可以使用 RexOS 的 browser 工具(browser_navigate/click/type/press_key/read_page/screenshot/close)。
51+
你可以使用 RexOS 的 browser 工具(browser_navigate/click/type/press_key/wait_for/read_page/screenshot/close)。
5052
5153
规则:
52-
- navigate/click/type 之后必须立刻 browser_read_page,先验证页面状态再做下一步
54+
- navigate/click/type/press_key 之后尽快 browser_read_page;如果页面异步更新,先 browser_wait_for 再 read_page
5355
- 动作尽量少且可回滚。selector 失败时先读页面内容,再调整 selector。
5456
- 最后把截图保存到 .rexos/browser/<topic>.png。
5557
- 未经用户明确确认,不要输入账号密码,也不要进行任何付费/下单操作。

docs-site/zh/reference/tools.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ RexOS 会强制超时,并使用尽量最小的环境。
3535

3636
浏览器工具通过 Python Playwright bridge 提供无头浏览器自动化能力:
3737

38-
- `browser_navigate` / `browser_click` / `browser_type` / `browser_press_key` / `browser_read_page` / `browser_screenshot` / `browser_close`
38+
- `browser_navigate` / `browser_click` / `browser_type` / `browser_press_key` / `browser_wait_for` / `browser_read_page` / `browser_screenshot` / `browser_close`
3939

4040
说明:
4141

0 commit comments

Comments
 (0)