Skip to content

Commit e438e6e

Browse files
committed
feat(web-integration): add browser page selection actions
1 parent 5b1b54f commit e438e6e

8 files changed

Lines changed: 411 additions & 8 deletions

File tree

apps/site/docs/en/web-api-reference.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ const agent = new PuppeteerBrowserAgent(browser, page, {
8383
- `newPage()` — Create a new page and make it active.
8484
- `setActivePage(page: Page)` — Explicitly set which Puppeteer page the Browser Agent controls next.
8585
- `waitForNewPage(action?, options?)` — Wait for a newly opened page without implicitly switching the active page.
86+
- AI page selection actions — Browser Agent exposes `ListBrowserPages` and `SetActivePage` in its action space, so AI plans can inspect open tabs and switch the active page by index, title, or URL. These actions are not available on Page Agent.
8687

8788
### Examples
8889

@@ -193,6 +194,7 @@ const agent = new PlaywrightBrowserAgent(context, page, {
193194
- `newPage()` — Create a new page and make it active.
194195
- `setActivePage(page: Page)` — Explicitly set which Playwright page the Browser Agent controls next.
195196
- `waitForNewPage(action?, options?)` — Wait for a newly opened page without implicitly switching the active page.
197+
- AI page selection actions — Browser Agent exposes `ListBrowserPages` and `SetActivePage` in its action space, so AI plans can inspect open tabs and switch the active page by index, title, or URL. These actions are not available on Page Agent.
196198

197199
### Examples
198200

apps/site/docs/zh/web-api-reference.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ const agent = new PuppeteerBrowserAgent(browser, page, {
8383
- `newPage()` —— 创建新页面并将其设为 active page。
8484
- `setActivePage(page: Page)` —— 显式指定 Browser Agent 接下来控制哪个 Puppeteer 页面。
8585
- `waitForNewPage(action?, options?)` —— 等待新打开的页面,但不会隐式切换 active page。
86+
- AI 页面选择 action —— Browser Agent 会在 actionSpace 中暴露 `ListBrowserPages` 和 `SetActivePage`,AI 计划可以先查看已打开的标签页,再按 index、title 或 URL 切换 active page。这些 action 不会出现在 Page Agent 上。
8687

8788
### 示例
8889

@@ -193,6 +194,7 @@ const agent = new PlaywrightBrowserAgent(context, page, {
193194
- `newPage()` —— 创建新页面并将其设为 active page。
194195
- `setActivePage(page: Page)` —— 显式指定 Browser Agent 接下来控制哪个 Playwright 页面。
195196
- `waitForNewPage(action?, options?)` —— 等待新打开的页面,但不会隐式切换 active page。
197+
- AI 页面选择 action —— Browser Agent 会在 actionSpace 中暴露 `ListBrowserPages` 和 `SetActivePage`,AI 计划可以先查看已打开的标签页,再按 index、title 或 URL 切换 active page。这些 action 不会出现在 Page Agent 上。
196198

197199
### 示例
198200

packages/web-integration/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@
123123
"semver": "7.5.2",
124124
"socket.io": "^4.8.1",
125125
"socket.io-client": "4.8.1",
126-
"ws": "^8.18.1"
126+
"ws": "^8.18.1",
127+
"zod": "^3.25.1"
127128
},
128129
"devDependencies": {
129130
"@playwright/test": "^1.45.0",

packages/web-integration/src/common/browser-agent.ts

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
1+
import type { DeviceAction } from '@midscene/core';
12
import type { DebugFunction } from '@midscene/shared/logger';
3+
import { z } from 'zod';
24

35
export type BrowserAgentAdapter<Page, NewPageEvent> = {
46
pages(): Page[] | Promise<Page[]>;
57
newPage(): Promise<Page>;
68
isPageClosed(page: Page): boolean;
79
bringToFront(page: Page): Promise<void> | void;
10+
pageTitle(page: Page): Promise<string> | string;
11+
pageUrl(page: Page): string;
812
onNewPage(handler: (event: NewPageEvent) => void): void;
913
offNewPage(handler: (event: NewPageEvent) => void): void;
1014
resolveNewPage(event: NewPageEvent): Page | Promise<Page | null> | null;
@@ -21,6 +25,51 @@ export type BrowserAgentPageControllerOptions<Page, NewPageEvent> = {
2125
debug: DebugFunction;
2226
};
2327

28+
/**
 * Plain-data description of one browser page/tab, as returned by the
 * page-listing and page-switching helpers of the page controller.
 */
export type BrowserAgentPageSummary = {
29+
index: number; // position of the page in the adapter's `pages()` array (0-based)
30+
active: boolean; // whether this page was reported as the controller's active page
31+
title: string; // page title from the adapter; '' when reading it failed
32+
url: string; // page URL from the adapter; '' when reading it failed
33+
};
34+
35+
// Field schemas for the SetActivePage action parameters. Every field is
// optional at the schema level; the controller rejects a selector in which
// none of them carries a usable value.
const pageIndexField = z
  .number()
  .int()
  .min(0)
  .optional()
  .describe('0-based page/tab index returned by ListBrowserPages.');

const pageTitleField = z
  .string()
  .optional()
  .describe('Case-insensitive page title substring to match.');

const pageUrlField = z
  .string()
  .optional()
  .describe('Case-insensitive page URL substring to match.');

/** Parameter schema advertised for the `SetActivePage` action. */
const setActivePageParamSchema = z.object({
  index: pageIndexField,
  title: pageTitleField,
  url: pageUrlField,
});

/** Selector accepted by `setActivePageBySelector`, derived from the schema. */
export type SetActivePageParam = z.infer<typeof setActivePageParamSchema>;
53+
54+
/**
 * Prepare an optional text filter for case-insensitive matching.
 *
 * @param value Raw text, possibly missing or whitespace-only.
 * @returns The trimmed, lower-cased text, or `undefined` when nothing is left
 *   after trimming.
 */
const normalizeOptionalText = (value: string | undefined) => {
  if (value == null) {
    return undefined;
  }

  const cleaned = value.trim();
  if (cleaned.length === 0) {
    return undefined;
  }

  return cleaned.toLowerCase();
};
58+
59+
/**
 * Render a selector as a short human-readable phrase for error messages.
 *
 * Only fields that carry a value are mentioned: `index` when it is not
 * `undefined`, `title` / `url` when they are non-empty after trimming.
 *
 * @param selector Selector passed to the SetActivePage action.
 * @returns The mentioned fields joined by ", " (empty string when none apply).
 */
const describeSelector = (selector: SetActivePageParam) => {
  const titleText = selector.title?.trim();
  const urlText = selector.url?.trim();

  const fragments = [
    selector.index !== undefined ? `index ${selector.index}` : undefined,
    titleText ? `title "${titleText}"` : undefined,
    urlText ? `url "${urlText}"` : undefined,
  ];

  return fragments.filter((fragment) => fragment !== undefined).join(', ');
};
72+
2473
export class BrowserAgentPageController<Page, NewPageEvent> {
2574
private readonly agentName: string;
2675
private readonly adapter: BrowserAgentAdapter<Page, NewPageEvent>;
@@ -54,6 +103,17 @@ export class BrowserAgentPageController<Page, NewPageEvent> {
54103
return this.adapter.pages();
55104
}
56105

106+
/**
 * Describe every page currently reported by the adapter.
 *
 * @returns One summary per page, in adapter order, with `active` set on the
 *   page that is identical to the controller's active page.
 */
async pageSummaries(): Promise<BrowserAgentPageSummary[]> {
  const knownPages = await this.adapter.pages();
  const current = this.activePage;

  // Start all lookups first so the per-page title reads overlap.
  const pending = knownPages.map((candidate, position) =>
    this.pageSummary(candidate, position, candidate === current),
  );

  return Promise.all(pending);
}
116+
57117
async newPage() {
58118
const page = await this.adapter.newPage();
59119
await this.setActivePage(page);
@@ -75,6 +135,68 @@ export class BrowserAgentPageController<Page, NewPageEvent> {
75135
}
76136
}
77137

138+
/**
 * Make the page identified by `selector` the active page and return its summary.
 *
 * Selection rules:
 * - When `selector.index` is provided it takes precedence; `title` and `url`
 *   are not consulted.
 * - Otherwise `title` and/or `url` are matched as case-insensitive substrings
 *   against every open page, and exactly one page must match.
 *
 * @param selector Index, title substring and/or URL substring of the target page.
 * @returns Summary of the page that is now active (`active` is `true`).
 * @throws Error when the selector carries no usable field, when the index does
 *   not refer to an open page, when no page matches, or when several pages match.
 */
async setActivePageBySelector(
  selector: SetActivePageParam,
): Promise<BrowserAgentPageSummary> {
  const { index } = selector;
  const title = normalizeOptionalText(selector.title);
  const url = normalizeOptionalText(selector.url);

  if (index === undefined && !title && !url) {
    throw new Error(
      `[midscene] SetActivePage requires index, title, or url for ${this.agentName}.`,
    );
  }

  const pages = await this.adapter.pages();

  if (index !== undefined) {
    const page = pages[index];
    if (!page || this.adapter.isPageClosed(page)) {
      // Only advertise indexes that can actually be selected: a closed page
      // would be rejected by this very check on the next attempt.
      const openIndexes = pages
        .map((candidate, position) => ({ candidate, position }))
        .filter(({ candidate }) => !this.adapter.isPageClosed(candidate))
        .map(({ position }) => position);
      throw new Error(
        `[midscene] Cannot find ${this.agentName} page with index ${index}. Available page indexes: ${
          openIndexes.join(', ') || 'none'
        }`,
      );
    }

    await this.setActivePage(page);
    return this.pageSummary(page, index, true);
  }

  // Closed pages can never become active, so they are not inspected at all.
  const openPages = pages
    .map((page, position) => ({ page, position }))
    .filter(({ page }) => !this.adapter.isPageClosed(page));

  // Read all titles/URLs concurrently; pageSummary never rejects because it
  // falls back to empty strings when a read fails.
  const inspected = await Promise.all(
    openPages.map(async (entry) => ({
      entry,
      summary: await this.pageSummary(entry.page, entry.position, false),
    })),
  );

  const matchedPages = inspected
    .filter(
      ({ summary }) =>
        (!title || summary.title.toLowerCase().includes(title)) &&
        (!url || summary.url.toLowerCase().includes(url)),
    )
    .map(({ entry }) => entry);

  if (matchedPages.length > 1) {
    throw new Error(
      `[midscene] Multiple ${this.agentName} pages matched ${describeSelector(selector)}. Use ListBrowserPages and pass an index to SetActivePage.`,
    );
  }

  const [match] = matchedPages;
  if (!match) {
    throw new Error(
      `[midscene] Cannot find ${this.agentName} page matching ${describeSelector(selector)}.`,
    );
  }

  await this.setActivePage(match.page);
  // Re-read the summary after switching so it reflects the page's current state.
  return this.pageSummary(match.page, match.position, true);
}
199+
78200
async waitForNewPage(
79201
action?: () => Promise<unknown> | unknown,
80202
opts?: { timeout?: number },
@@ -94,6 +216,34 @@ export class BrowserAgentPageController<Page, NewPageEvent> {
94216
this.adapter.offNewPage(this.newPageHandler);
95217
}
96218

219+
/**
 * Build the summary for a single page.
 *
 * Reading the title or the URL is best-effort: a failure is reported through
 * the debug logger and the corresponding field stays an empty string.
 *
 * @param page Page to describe.
 * @param index Index to report for the page.
 * @param active Whether to report the page as active.
 * @returns The assembled summary.
 */
private async pageSummary(
  page: Page,
  index: number,
  active: boolean,
): Promise<BrowserAgentPageSummary> {
  const summary: BrowserAgentPageSummary = {
    index,
    active,
    title: '',
    url: '',
  };

  try {
    summary.title = await this.adapter.pageTitle(page);
  } catch (error) {
    this.debug(`failed to read page title: ${error}`);
  }

  try {
    summary.url = this.adapter.pageUrl(page);
  } catch (error) {
    this.debug(`failed to read page url: ${error}`);
  }

  return summary;
}
246+
97247
private async followNewPage(event: NewPageEvent) {
98248
if (!this.isNewPageEvent(event)) {
99249
return;
@@ -165,3 +315,41 @@ export class BrowserAgentPageController<Page, NewPageEvent> {
165315
return { promise, dispose };
166316
}
167317
}
318+
319+
/**
 * Create the two page-selection actions (`ListBrowserPages`, `SetActivePage`).
 *
 * The page controller is looked up through `getPageController` each time an
 * action is called, not when the actions are created, so the actions can be
 * built before the controller exists.
 *
 * @param options.agentName Accepted for the caller's convenience; not read here.
 * @param options.getPageController Accessor for the controller that performs the work.
 * @returns The action definitions, listing action first.
 */
export const createBrowserAgentPageActions = <Page, NewPageEvent>(options: {
  agentName: string;
  getPageController: () => BrowserAgentPageController<Page, NewPageEvent>;
}): DeviceAction<any>[] => {
  const listBrowserPages: DeviceAction<any> = {
    name: 'ListBrowserPages',
    description:
      'List all open browser pages/tabs and show which one is currently active. Use this before switching pages when a task refers to another tab or window.',
    call: async () => {
      const controller = options.getPageController();
      return controller.pageSummaries();
    },
  };

  const setActivePage: DeviceAction<any> = {
    name: 'SetActivePage',
    description:
      'Set the active browser page/tab by 0-based index, title substring, or URL substring. Use index from ListBrowserPages when more than one page could match.',
    paramSchema: setActivePageParamSchema,
    sample: {
      index: 1,
    },
    call: async (param) => {
      const controller = options.getPageController();
      return controller.setActivePageBySelector(param);
    },
  };

  return [listBrowserPages, setActivePage];
};
341+
342+
/**
 * Combine user-supplied custom actions with the built-in browser actions.
 *
 * Custom actions come first and win on name clashes: a browser action whose
 * name is already used by a custom action is left out.
 *
 * @param customActions Actions provided by the caller, if any.
 * @param browserActions Built-in page-selection actions.
 * @returns `browserActions` itself when there are no custom actions, otherwise
 *   a new array holding the custom actions followed by the remaining browser actions.
 */
export const appendBrowserAgentPageActions = (
  customActions: DeviceAction<any>[] | undefined,
  browserActions: DeviceAction<any>[],
) => {
  if (customActions == null || !customActions.length) {
    return browserActions;
  }

  const merged = [...customActions];
  const takenNames = new Set(customActions.map((custom) => custom.name));

  for (const builtin of browserActions) {
    if (!takenNames.has(builtin.name)) {
      merged.push(builtin);
    }
  }

  return merged;
};

packages/web-integration/src/playwright/browser-agent.ts

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import {
22
type BrowserAgentAdapter,
33
BrowserAgentPageController,
4+
appendBrowserAgentPageActions,
5+
createBrowserAgentPageActions,
46
} from '@/common/browser-agent';
57
import {
68
applyForceChromeSelectRendering,
@@ -24,6 +26,8 @@ const createPlaywrightBrowserAdapter = (
2426
newPage: () => context.newPage(),
2527
isPageClosed: (page) => page.isClosed(),
2628
bringToFront: (page) => page.bringToFront(),
29+
pageTitle: (page) => page.title(),
30+
pageUrl: (page) => page.url(),
2731
onNewPage: (handler) => context.on('page', handler),
2832
offNewPage: (handler) => context.off('page', handler),
2933
resolveNewPage: (page) => page,
@@ -73,13 +77,32 @@ export class PlaywrightBrowserAgent extends PageAgent<PlaywrightWebPage> {
7377
...agentOpts
7478
} = opts ?? {};
7579
const { forceChromeSelectRendering } = agentOpts;
80+
const pageControllerRef: {
81+
current?: BrowserAgentPageController<PlaywrightPage, PlaywrightPage>;
82+
} = {};
83+
const getPageController = () => {
84+
if (!pageControllerRef.current) {
85+
throw new Error(
86+
'[midscene] PlaywrightBrowserAgent page controller is not initialized.',
87+
);
88+
}
89+
return pageControllerRef.current;
90+
};
91+
const browserActions = createBrowserAgentPageActions({
92+
agentName: 'PlaywrightBrowserAgent',
93+
getPageController,
94+
});
7695
const webPage = new PlaywrightWebPage(initialPage, {
7796
...agentOpts,
7897
forceSameTabNavigation: false,
98+
customActions: appendBrowserAgentPageActions(
99+
agentOpts.customActions,
100+
browserActions,
101+
),
79102
});
80103
super(webPage, agentOpts);
81104

82-
this.pageController = new BrowserAgentPageController({
105+
const pageController = new BrowserAgentPageController({
83106
agentName: 'PlaywrightBrowserAgent',
84107
adapter: createPlaywrightBrowserAdapter(context),
85108
getActivePage: () => this.interface.underlyingPage as PlaywrightPage,
@@ -90,6 +113,8 @@ export class PlaywrightBrowserAgent extends PageAgent<PlaywrightWebPage> {
90113
newPageTimeout,
91114
debug,
92115
});
116+
pageControllerRef.current = pageController;
117+
this.pageController = pageController;
93118

94119
applyForceChromeSelectRendering(
95120
initialPage,

packages/web-integration/src/puppeteer/browser-agent.ts

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import {
22
type BrowserAgentAdapter,
33
BrowserAgentPageController,
4+
appendBrowserAgentPageActions,
5+
createBrowserAgentPageActions,
46
} from '@/common/browser-agent';
57
import {
68
applyForceChromeSelectRendering,
@@ -25,6 +27,8 @@ const createPuppeteerBrowserAdapter = (
2527
newPage: () => browser.newPage(),
2628
isPageClosed: (page) => page.isClosed(),
2729
bringToFront: (page) => page.bringToFront(),
30+
pageTitle: (page) => page.title(),
31+
pageUrl: (page) => page.url(),
2832
onNewPage: (handler) => browser.on('targetcreated', handler),
2933
offNewPage: (handler) => browser.off('targetcreated', handler),
3034
isNewPageEvent: (target) => target.type() === 'page',
@@ -75,13 +79,32 @@ export class PuppeteerBrowserAgent extends PageAgent<PuppeteerWebPage> {
7579
...agentOpts
7680
} = opts ?? {};
7781
const { forceChromeSelectRendering } = agentOpts;
82+
const pageControllerRef: {
83+
current?: BrowserAgentPageController<PuppeteerPage, PuppeteerTarget>;
84+
} = {};
85+
const getPageController = () => {
86+
if (!pageControllerRef.current) {
87+
throw new Error(
88+
'[midscene] PuppeteerBrowserAgent page controller is not initialized.',
89+
);
90+
}
91+
return pageControllerRef.current;
92+
};
93+
const browserActions = createBrowserAgentPageActions({
94+
agentName: 'PuppeteerBrowserAgent',
95+
getPageController,
96+
});
7897
const webPage = new PuppeteerWebPage(initialPage, {
7998
...agentOpts,
8099
forceSameTabNavigation: false,
100+
customActions: appendBrowserAgentPageActions(
101+
agentOpts.customActions,
102+
browserActions,
103+
),
81104
});
82105
super(webPage, agentOpts);
83106

84-
this.pageController = new BrowserAgentPageController({
107+
const pageController = new BrowserAgentPageController({
85108
agentName: 'PuppeteerBrowserAgent',
86109
adapter: createPuppeteerBrowserAdapter(browser),
87110
getActivePage: () => this.interface.underlyingPage as PuppeteerPage,
@@ -92,6 +115,8 @@ export class PuppeteerBrowserAgent extends PageAgent<PuppeteerWebPage> {
92115
newPageTimeout,
93116
debug,
94117
});
118+
pageControllerRef.current = pageController;
119+
this.pageController = pageController;
95120

96121
applyForceChromeSelectRendering(
97122
initialPage,

0 commit comments

Comments
 (0)