Skip to content

Commit 8f7192c

Browse files
[fix]: make file upload elements more explicit in page snapshot (#1975)
# why

- File upload elements sometimes have a 'button' AX role.
- This causes issues when users prompt `.observe()` to find "file upload elements", since the model sees them as a 'button'.
- Addresses #972.

# what changed

- This PR preserves the semantics of file upload elements by using the actual DOM tag name and 'type'.
- E.g., instead of `[0-12] button: Choose File`, we now use `[0-12] input, file: Choose File`.

# test plan

- Added a unit test for the replacement logic.
- Added an `observe()` eval which checks that hidden file upload elements are correctly found by the LLM.

<!-- This is an auto-generated description by cubic. -->

---

## Summary by cubic

Make file upload inputs explicit in the page snapshot so `.observe()` can reliably find them. Addresses Linear STG-934 by preventing file inputs from being misclassified as buttons.

- **Bug Fixes**
  - Enrich DOM tag names to include input type (e.g., `input, file`) and use them in DOM maps.
  - Force file inputs to appear as `input, file` in the a11y outline instead of AX `button`.
  - Add unit test and fix/register the `observe_file_uploads` eval in `evals.config.json`.
  - Publish patch for `@browserbasehq/stagehand`.

<sup>Written for commit 2fad86f. Summary will update on new commits. <a href="https://cubic.dev/pr/browserbase/stagehand/pull/1975">Review in cubic</a></sup>

<!-- End of auto-generated description by cubic. -->
1 parent 68d767a commit 8f7192c

6 files changed

Lines changed: 122 additions & 2 deletions

File tree

.changeset/violet-moons-attend.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
make file upload elements more explicit in page snapshot

packages/core/lib/v3/understudy/a11y/snapshot/a11yTree.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,12 @@ export function decorateRoles(
130130
: `scrollable${role ? `, ${role}` : ""}`;
131131
}
132132

133+
// File inputs typically get role "button" from Chrome's AX tree;
134+
// override so they appear as "input, file" in the outline.
135+
if (tag === "input, file") {
136+
role = tag;
137+
}
138+
133139
return {
134140
role,
135141
name: n.name?.value,

packages/core/lib/v3/understudy/a11y/snapshot/domTree.ts

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ export async function domMapsForSession(
219219

220220
if (node.backendNodeId) {
221221
const encId = encode(frameId, node.backendNodeId);
222-
tagNameMap[encId] = String(node.nodeName).toLowerCase();
222+
tagNameMap[encId] = enrichedTagName(node);
223223
xpathMap[encId] = xpath || "/";
224224
const isScrollable = node?.isScrollable === true;
225225
if (isScrollable) scrollableMap[encId] = true;
@@ -275,7 +275,7 @@ export async function buildSessionDomIndex(
275275
const { node, xp, docRootBe } = stack.pop()!;
276276
if (node.backendNodeId) {
277277
absByBe.set(node.backendNodeId, xp || "/");
278-
tagByBe.set(node.backendNodeId, String(node.nodeName).toLowerCase());
278+
tagByBe.set(node.backendNodeId, enrichedTagName(node));
279279
if (node?.isScrollable === true) scrollByBe.set(node.backendNodeId, true);
280280
docRootOf.set(node.backendNodeId, docRootBe);
281281
}
@@ -328,6 +328,31 @@ export function relativizeXPath(baseAbs: string, nodeAbs: string): string {
328328
return abs;
329329
}
330330

331+
/**
 * Extract an attribute value from a CDP DOM node's flat attributes array.
 * Attributes are stored as [name1, value1, name2, value2, ...].
 *
 * @param attrs - Flat name/value list, or `undefined` when the node has none.
 * @param name - Attribute name to look up; compared with strict equality.
 * @returns The value paired with the first matching name, or `undefined`
 *   when `attrs` is missing or no name slot matches.
 */
function getAttr(
  attrs: string[] | undefined,
  name: string,
): string | undefined {
  if (!attrs) return undefined;
  // Step by two so only the name slots (even indices) are compared.
  for (let i = 0; i < attrs.length; i += 2) {
    if (attrs[i] === name) return attrs[i + 1];
  }
  return undefined;
}

/**
 * Build an enriched tag name that includes the type attribute for inputs.
 *
 * Non-input nodes yield their lowercased node name. Inputs with a non-empty
 * `type` attribute yield `input, <type>`; inputs without one yield `input`.
 *
 * The type is lowercased because HTML treats `type` case-insensitively, so
 * `<input type="FILE">` must still produce `input, file` for consumers that
 * compare against the lowercase literal.
 *
 * @param node - CDP DOM node whose `nodeName` and `attributes` are read.
 * @returns The lowercase tag name, optionally suffixed with the input type.
 */
function enrichedTagName(node: Protocol.DOM.Node): string {
  const tag = String(node.nodeName).toLowerCase();
  if (tag !== "input") return tag;

  const type = getAttr(node.attributes, "type")?.toLowerCase();
  // An absent or empty type attribute falls back to the bare tag name.
  return type ? `input, ${type}` : tag;
}
355+
331356
/** Find a node by backendNodeId inside a DOM.getDocument tree. */
332357
export function findNodeByBackendId(
333358
root: Protocol.DOM.Node,

packages/core/tests/unit/snapshot-a11y-tree-utils.test.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,28 @@ describe("decorateRoles", () => {
8181
]);
8282
});
8383

84+
it("overrides role to 'input, file' for file inputs", () => {
  // The DOM-side tag map marks the node encoded as "enc-10" as a file input.
  const fileInputOpts: A11yOptions = {
    ...defaultOpts,
    tagNameMap: { "enc-10": "input, file" },
    scrollableMap: {},
  };

  // The AX tree reports the same node as a plain button.
  const axButton = makeAxNode({
    backendDOMNodeId: 10,
    role: axString("button"),
    name: axString("Choose File"),
  });

  const result = decorateRoles([axButton], fileInputOpts);

  // The enriched tag name must win over the AX role; the name is untouched.
  const expected = {
    encodedId: "enc-10",
    role: "input, file",
    name: "Choose File",
  };
  expect(result[0]).toMatchObject(expected);
});
105+
84106
it("falls back when encoding fails", () => {
85107
const opts: A11yOptions = {
86108
...defaultOpts,

packages/evals/evals.config.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,10 @@
256256
"name": "observe_iframes2",
257257
"categories": ["regression", "observe"]
258258
},
259+
{
260+
"name": "observe_file_uploads",
261+
"categories": ["observe"]
262+
},
259263
{
260264
"name": "extract_hamilton_weather",
261265
"categories": ["targeted_extract", "regression"]
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import { EvalFunction } from "../types/evals.js";
2+
3+
/**
 * Eval: checks that `observe()` locates the file upload input on the
 * `file-uploads-3` eval site.
 *
 * The first observation's selector and the expected XPath are both resolved
 * to backend node ids; the eval succeeds only when the ids are equal.
 *
 * Failure paths each carry their own message:
 * - `observe()` returned nothing,
 * - the returned selector resolved to a different node,
 * - something threw (navigation, observe, or locator resolution), in which
 *   case the thrown error's own message is reported.
 *
 * The `v3` instance is always closed, whatever the outcome.
 */
export const observe_file_uploads: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  v3,
  logger,
}) => {
  try {
    const page = v3.context.pages()[0];
    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/file-uploads-3/",
    );

    const observations = await v3.observe("find the file upload element");

    if (observations.length === 0) {
      return {
        _success: false,
        message: "observe returned no results",
        observations,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    const expectedLocator = `xpath=/html/body/input`;

    const expectedBackendNodeId = await page
      .locator(expectedLocator)
      .backendNodeId();

    // Only the first observation is checked against the expected node.
    const actualBackendNodeId = await page
      .locator(observations[0].selector)
      .backendNodeId();
    const foundMatch = expectedBackendNodeId === actualBackendNodeId;

    return {
      _success: foundMatch,
      // Explain the failure only when there is one, so the success payload
      // keeps the same shape as before.
      ...(foundMatch
        ? {}
        : {
            message:
              "returned selector does not resolve to same node as expected",
          }),
      observations,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // Anything can land here (navigation, observe, locator resolution), so
    // report what was actually thrown instead of assuming a node mismatch.
    const detail = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      error,
      message: `eval threw before the selector comparison completed: ${detail}`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await v3.close();
  }
};

0 commit comments

Comments
 (0)