Skip to content

Commit 1ebf591

Browse files
committed
feat(tools): add browser_press_key
1 parent 2ae9046 commit 1ebf591

7 files changed

Lines changed: 115 additions & 8 deletions

File tree

crates/rexos-tools/src/browser_bridge.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,32 @@ def main() -> int:
9090
continue
9191
page.fill(selector, text, timeout=timeout_ms)
9292
respond({"success": True, "data": {"selector": selector, "typed": text}})
93+
elif action == "PressKey":
94+
key = cmd.get("key", "")
95+
selector = cmd.get("selector", "")
96+
if not key:
97+
respond({"success": False, "error": "missing key"})
98+
continue
99+
if selector:
100+
page.press(selector, key, timeout=timeout_ms)
101+
else:
102+
page.keyboard.press(key)
103+
try:
104+
page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)
105+
except Exception:
106+
pass
107+
current_url = page.url
108+
respond(
109+
{
110+
"success": True,
111+
"data": {
112+
"key": key,
113+
"selector": selector or None,
114+
"title": page.title(),
115+
"url": current_url,
116+
},
117+
}
118+
)
93119
elif action == "ReadPage":
94120
content = "(empty)"
95121
try:
@@ -145,4 +171,3 @@ def respond(obj) -> None:
145171

146172
if __name__ == "__main__":
147173
raise SystemExit(main())
148-

crates/rexos-tools/src/lib.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ impl Toolset {
8989
browser_navigate_def(),
9090
browser_click_def(),
9191
browser_type_def(),
92+
browser_press_key_def(),
9293
browser_read_page_def(),
9394
browser_screenshot_def(),
9495
browser_close_def(),
@@ -263,6 +264,12 @@ impl Toolset {
263264
serde_json::from_str(arguments_json).context("parse browser_type arguments")?;
264265
self.browser_type(&args.selector, &args.text).await
265266
}
267+
"browser_press_key" => {
268+
let args: BrowserPressKeyArgs = serde_json::from_str(arguments_json)
269+
.context("parse browser_press_key arguments")?;
270+
self.browser_press_key(args.selector.as_deref(), &args.key)
271+
.await
272+
}
266273
"browser_read_page" => {
267274
let _args: serde_json::Value = serde_json::from_str(arguments_json)
268275
.context("parse browser_read_page arguments")?;
@@ -1289,6 +1296,24 @@ impl Toolset {
12891296
Ok(resp.into_tool_output()?)
12901297
}
12911298

1299+
async fn browser_press_key(&self, selector: Option<&str>, key: &str) -> anyhow::Result<String> {
1300+
let mut guard = self.browser.lock().await;
1301+
let session = guard
1302+
.as_mut()
1303+
.context("browser session not started; call browser_navigate first")?;
1304+
1305+
let mut cmd = serde_json::json!({
1306+
"action": "PressKey",
1307+
"key": key,
1308+
});
1309+
if let Some(sel) = selector {
1310+
cmd["selector"] = serde_json::Value::String(sel.to_string());
1311+
}
1312+
1313+
let resp = session.send(cmd).await?;
1314+
Ok(resp.into_tool_output()?)
1315+
}
1316+
12921317
async fn browser_read_page(&self) -> anyhow::Result<String> {
12931318
let mut guard = self.browser.lock().await;
12941319
let session = guard
@@ -1737,6 +1762,13 @@ struct BrowserTypeArgs {
17371762
text: String,
17381763
}
17391764

1765+
#[derive(Debug, serde::Deserialize)]
1766+
struct BrowserPressKeyArgs {
1767+
key: String,
1768+
#[serde(default)]
1769+
selector: Option<String>,
1770+
}
1771+
17401772
#[derive(Debug, serde::Deserialize)]
17411773
struct BrowserScreenshotArgs {
17421774
#[serde(default)]
@@ -3184,6 +3216,26 @@ fn browser_type_def() -> ToolDefinition {
31843216
}
31853217
}
31863218

3219+
fn browser_press_key_def() -> ToolDefinition {
3220+
ToolDefinition {
3221+
kind: "function".to_string(),
3222+
function: ToolFunctionDefinition {
3223+
name: "browser_press_key".to_string(),
3224+
description: "Press a key in the browser (optionally on a target element)."
3225+
.to_string(),
3226+
parameters: serde_json::json!({
3227+
"type": "object",
3228+
"properties": {
3229+
"key": { "type": "string", "description": "Key to press (example: Enter, Escape, ArrowDown, Control+A)." },
3230+
"selector": { "type": "string", "description": "Optional CSS selector to target before pressing the key." }
3231+
},
3232+
"required": ["key"],
3233+
"additionalProperties": false
3234+
}),
3235+
},
3236+
}
3237+
}
3238+
31873239
fn browser_read_page_def() -> ToolDefinition {
31883240
ToolDefinition {
31893241
kind: "function".to_string(),
@@ -3332,6 +3384,7 @@ mod tests {
33323384
"browser_navigate",
33333385
"browser_click",
33343386
"browser_type",
3387+
"browser_press_key",
33353388
"browser_read_page",
33363389
"browser_screenshot",
33373390
"browser_close",
@@ -4005,6 +4058,22 @@ mod tests {
40054058
);
40064059
}
40074060

4061+
#[tokio::test]
4062+
async fn browser_press_key_requires_session() {
4063+
let tmp = tempfile::tempdir().unwrap();
4064+
let tools = Toolset::new(tmp.path().to_path_buf()).unwrap();
4065+
4066+
let err = tools
4067+
.call("browser_press_key", r#"{ "key": "Enter" }"#)
4068+
.await
4069+
.unwrap_err();
4070+
let msg = err.to_string();
4071+
assert!(
4072+
msg.contains("browser_navigate") || msg.contains("session"),
4073+
"{msg}"
4074+
);
4075+
}
4076+
40084077
#[tokio::test]
40094078
async fn browser_read_page_requires_session() {
40104079
let tmp = tempfile::tempdir().unwrap();
@@ -4059,6 +4128,13 @@ mod tests {
40594128
.await
40604129
.unwrap();
40614130

4131+
let out = tools
4132+
.call("browser_press_key", r#"{ "key": "Enter" }"#)
4133+
.await
4134+
.unwrap();
4135+
let v: serde_json::Value = serde_json::from_str(&out).unwrap();
4136+
assert_eq!(v["key"], "Enter");
4137+
40624138
let page = tools.call("browser_read_page", r#"{}"#).await.unwrap();
40634139
let v: serde_json::Value = serde_json::from_str(&page).unwrap();
40644140
assert_eq!(v["title"], "Stub");
@@ -4137,6 +4213,8 @@ for line in sys.stdin:
41374213
resp = {"success": True, "data": {"clicked": cmd.get("selector", "")}}
41384214
elif action == "Type":
41394215
resp = {"success": True, "data": {"typed": cmd.get("text", ""), "selector": cmd.get("selector", "")}}
4216+
elif action == "PressKey":
4217+
resp = {"success": True, "data": {"key": cmd.get("key", ""), "selector": cmd.get("selector", "")}}
41404218
elif action == "Close":
41414219
resp = {"success": True, "data": {"status": "closed"}}
41424220
sys.stdout.write(json.dumps(resp) + "\n")

docs-site/how-to/browser-automation.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ If your Python executable isn't `python3`, set `REXOS_BROWSER_PYTHON` (example:
1818
- `browser_navigate` — open a URL (SSRF-protected by default)
1919
- `browser_click` — click by CSS selector (best-effort text fallback)
2020
- `browser_type` — fill an input
21+
- `browser_press_key` — press a key (example: `Enter` to submit a form)
2122
- `browser_read_page` — return `{title,url,content}` (content is truncated)
2223
- `browser_screenshot` — write a PNG to a workspace-relative path
2324
- `browser_close` — close the session (idempotent)
@@ -27,6 +28,7 @@ If your Python executable isn't `python3`, set `REXOS_BROWSER_PYTHON` (example:
2728
1. `browser_navigate` to the entry page
2829
2. `browser_read_page` to confirm state
2930
3. One small action: `browser_click` or `browser_type`
31+
- If you need to submit a form, use `browser_press_key` with `Enter`.
3032
4. `browser_read_page` again to confirm the page changed
3133
5. Repeat until done, then `browser_screenshot` for evidence and `browser_close`
3234

@@ -46,7 +48,7 @@ If a CSS selector fails, `browser_click` will try a **best-effort visible-text f
4648
Use this as a starting point for agent prompts:
4749

4850
```text
49-
You may use RexOS browser tools (browser_navigate/click/type/read_page/screenshot/close).
51+
You may use RexOS browser tools (browser_navigate/click/type/press_key/read_page/screenshot/close).
5052
5153
Rules:
5254
- Always call browser_read_page after navigate/click/type to verify page state before the next step.

docs-site/reference/tools.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ By default it rejects loopback/private IPs (basic SSRF protection). For local te
3535

3636
Browser tools enable headless browser automation via a Python Playwright bridge:
3737

38-
- `browser_navigate` / `browser_click` / `browser_type` / `browser_read_page` / `browser_screenshot` / `browser_close`
38+
- `browser_navigate` / `browser_click` / `browser_type` / `browser_press_key` / `browser_read_page` / `browser_screenshot` / `browser_close`
3939

4040
Notes:
4141

docs-site/zh/how-to/browser-automation.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ python3 -m playwright install chromium
1818
- `browser_navigate`:打开 URL(默认带 SSRF 防护)
1919
- `browser_click`:按 CSS selector 点击(会做尽力的可见文本 fallback)
2020
- `browser_type`:填写输入框
21+
- `browser_press_key`:按键(例如用 `Enter` 提交表单)
2122
- `browser_read_page`:返回 `{title,url,content}`(content 会被截断)
2223
- `browser_screenshot`:把 PNG 写入 workspace 相对路径
2324
- `browser_close`:关闭 session(可重复调用)
@@ -27,6 +28,7 @@ python3 -m playwright install chromium
2728
1. `browser_navigate` 打开入口页面
2829
2. `browser_read_page` 确认状态
2930
3. 每次只做一个小动作:`browser_click` 或 `browser_type`
31+
- 需要提交表单时,用 `browser_press_key` 按 `Enter`
3032
4. 再次 `browser_read_page` 确认页面确实变化
3133
5. 直到完成,最后 `browser_screenshot` 留证并 `browser_close`
3234

@@ -44,7 +46,7 @@ python3 -m playwright install chromium
4446
## Prompt 模板(可直接复制)
4547

4648
```text
47-
你可以使用 RexOS 的 browser 工具(browser_navigate/click/type/read_page/screenshot/close)。
49+
你可以使用 RexOS 的 browser 工具(browser_navigate/click/type/press_key/read_page/screenshot/close)。
4850
4951
规则:
5052
- navigate/click/type 之后必须立刻 browser_read_page,先验证页面状态再做下一步。

docs-site/zh/reference/tools.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ RexOS 会强制超时,并使用尽量最小的环境。
3535

3636
浏览器工具通过 Python Playwright bridge 提供无头浏览器自动化能力:
3737

38-
- `browser_navigate` / `browser_click` / `browser_type` / `browser_read_page` / `browser_screenshot` / `browser_close`
38+
- `browser_navigate` / `browser_click` / `browser_type` / `browser_press_key` / `browser_read_page` / `browser_screenshot` / `browser_close`
3939

4040
说明:
4141

docs/alignment.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,9 @@
106106

107107
新增:
108108
- `web_fetch`:SSRF 防护(默认拒绝 loopback/private/link-local),支持 `allow_private=true`(便于本地测试/内网)
109-
- 兼容工具面(aliases + stubs):
110-
- 已实现:`file_read/file_write/file_list`、`apply_patch`、`shell_exec`、`web_search`、`memory_store/memory_recall`
111-
- 已对齐命名但未实现(stub):`agent_*` / `task_*` / `schedule_*` / `knowledge_*` / `cron_*` / `channel_send` / `a2a_*` / `process_*` 等(会返回 not implemented)
109+
- 兼容工具面(aliases + reserved names):
110+
- Toolset:`file_*`、`apply_patch`、`shell_exec`、`web_search`、`web_fetch`、`browser_*`、`a2a_*`、`docker_exec`、`process_*`、`speech_to_text`、`text_to_speech`、`canvas_present`
111+
- Runtime:`memory_*`、`agent_*`、`hand_*`、`task_*`、`event_publish`、`schedule_*`、`knowledge_*`、`cron_*`、`channel_send`
112112

113113
对应实现:
114114
- `crates/rexos-tools/src/lib.rs`:`fs_read/fs_write/shell/web_fetch`

0 commit comments

Comments
 (0)