Skip to content

Commit ba6ef85

Browse files
fix: improve locator scoring logic
fix: improve locator scoring logic
2 parents d2bfcda + 855f9d6 commit ba6ef85

3 files changed

Lines changed: 67 additions & 6 deletions

File tree

src/agent/element-finder.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,10 @@ export async function findByIdStrategies(
5656
if (uuid) return uuid;
5757
}
5858

59-
// 3. Try finding by visible text (xpath) — useful when id/accessibilityId are generic
59+
// 3. Try finding by visible text (xpath) — [1] ensures the first DOM match when duplicates exist
6060
if (text) {
6161
const escapedText = text.replace(/'/g, "\\'");
62-
const uuid = await findElement(mcp, 'xpath', `//*[@text='${escapedText}']`).catch(() => null);
62+
const uuid = await findElement(mcp, 'xpath', `(//*[@text='${escapedText}'])[1]`).catch(() => null);
6363
if (uuid) return uuid;
6464
}
6565

src/llm/prompts.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ HOW TO INTERACT (DOM MODE)
3636
Example: strategy="id", selector="com.google.android.gm:id/compose_button"
3737
3838
**How to pick:** Look at the DOM element:
39+
- Has xpath="..."? → Use **xpath** with that exact value (it uniquely identifies the element when duplicates exist)
3940
- Has desc="X"? → Use accessibility id with "X"
4041
- Has rid="Y"? → Use id with "Y"
4142
- Has only text="Z"? → Use accessibility id with "Z"

src/perception/dom-trimmer.ts

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@ interface TrimmedNode {
1414
tag: string;
1515
attrs: Record<string, string>;
1616
score: number;
17+
platform: 'android' | 'ios';
18+
/** 1-based position among all elements with the same primary selector (only set for duplicates) */
19+
domPos?: number;
20+
/** The xpath attribute name used to build the positional xpath ('content-desc', 'text', 'name', 'label') */
21+
xpathAttrName?: string;
22+
/** The selector value used for the positional xpath */
23+
xpathKey?: string;
1724
}
1825

1926
/** Result of trimDOM — compact XML plus pre-computed element counts. */
@@ -60,6 +67,25 @@ export function trimDOM(
6067
walkIOS(parsed, nodes);
6168
}
6269

70+
// Detect duplicate elements (same primary selector in DOM order) and annotate with
71+
// positional xpath so the LLM can precisely target a specific occurrence.
72+
const keyCount = new Map<string, number>();
73+
for (const node of nodes) {
74+
const sel = getPrimarySelector(node);
75+
if (sel) keyCount.set(sel.key, (keyCount.get(sel.key) ?? 0) + 1);
76+
}
77+
const keyPos = new Map<string, number>();
78+
for (const node of nodes) {
79+
const sel = getPrimarySelector(node);
80+
if (sel && (keyCount.get(sel.key) ?? 0) > 1) {
81+
const pos = (keyPos.get(sel.key) ?? 0) + 1;
82+
keyPos.set(sel.key, pos);
83+
node.domPos = pos;
84+
node.xpathAttrName = sel.attrName;
85+
node.xpathKey = sel.key;
86+
}
87+
}
88+
6389
// Sort by relevance score and take top N
6490
nodes.sort((a, b) => b.score - a.score);
6591
const top = nodes.slice(0, maxElements);
@@ -75,10 +101,15 @@ export function trimDOM(
75101

76102
// Build compact XML with element numbering
77103
const lines = top.map((node, i) => {
78-
const attrs = Object.entries(node.attrs)
104+
const attrs = { ...node.attrs };
105+
// Add positional xpath for duplicate elements so the LLM can select precisely
106+
if (node.domPos !== undefined && node.xpathKey && node.xpathAttrName) {
107+
attrs.xpath = `(//*[@${node.xpathAttrName}=${xpathString(node.xpathKey)}])[${node.domPos}]`;
108+
}
109+
const attrStr = Object.entries(attrs)
79110
.map(([k, v]) => `${k}="${escapeXml(v)}"`)
80111
.join(' ');
81-
return `<${node.tag} idx="${i + 1}" ${attrs}/>`;
112+
return `<${node.tag} idx="${i + 1}" ${attrStr}/>`;
82113
});
83114

84115
return {
@@ -182,7 +213,7 @@ function walkAndroid(node: any, result: TrimmedNode[], parentContext: string = '
182213
attrs.in = parentContext;
183214
}
184215

185-
result.push({ tag, attrs, score });
216+
result.push({ tag, attrs, score, platform: 'android' });
186217
}
187218

188219
walkChildrenAndroid(node, result, childContext);
@@ -278,7 +309,7 @@ function walkIOS(node: any, result: TrimmedNode[], parentContext: string = ''):
278309
attrs.in = parentContext;
279310
}
280311

281-
result.push({ tag, attrs, score });
312+
result.push({ tag, attrs, score, platform: 'ios' });
282313
}
283314

284315
walkChildrenIOS(node, result, childContext);
@@ -309,3 +340,32 @@ function escapeXml(str: string): string {
309340
.replace(/</g, '&lt;')
310341
.replace(/>/g, '&gt;');
311342
}
343+
344+
/**
345+
* Return the primary selector key and the corresponding raw Appium XML attribute name
346+
* for a trimmed node. Used to detect and annotate duplicate elements.
347+
*/
348+
// Picks the single attribute value that identifies a trimmed node for duplicate
// detection, together with the raw XML attribute name to use in an xpath predicate.
// Returns null when the node carries none of the attributes checked below.
function getPrimarySelector(
349+
node: TrimmedNode
350+
): { key: string; attrName: string } | null {
351+
if (node.platform === 'android') {
352+
// Android: prefer the trimmed `desc` attribute (raw name 'content-desc'), then `text`.
// Truthiness checks mean an empty-string attribute is treated as absent.
if (node.attrs.desc) return { key: node.attrs.desc, attrName: 'content-desc' };
353+
if (node.attrs.text) return { key: node.attrs.text, attrName: 'text' };
354+
} else {
355+
// Any non-android platform ('ios' per the TrimmedNode.platform union): prefer `name`,
// then the trimmed `text` attribute, which is reported under the raw name 'label'.
if (node.attrs.name) return { key: node.attrs.name, attrName: 'name' };
356+
if (node.attrs.text) return { key: node.attrs.text, attrName: 'label' };
357+
}
358+
// No usable selector attribute on this node.
// NOTE(review): `key` holds only the attribute value, not the attribute name, so a caller
// that groups nodes by `key` alone would treat desc="X" and text="X" as the same selector
// — confirm that is intended.
return null;
359+
}
360+
361+
/**
362+
* Produce a quoted xpath string literal, handling values that contain single quotes.
363+
* xpath 1.0 has no escape sequence for quotes, so we use concat() when needed.
364+
*/
365+
// Builds an xpath expression that evaluates to `value`: a quoted literal when one
// quote style is free, otherwise a concat(...) call.
function xpathString(value: string): string {
365+
// No single quote in the value: a single-quoted literal is safe.
if (!value.includes("'")) return `'${value}'`;
366+
// Has a single quote but no double quote: a double-quoted literal is safe.
if (!value.includes('"')) return `"${value}"`;
367+
// Contains both quote types: split around single quotes and concat
// Each piece is free of single quotes after the split, so it is wrapped in single
// quotes; the removed quote is re-inserted between pieces as the double-quoted
// argument "'". Empty pieces (leading, trailing or adjacent quotes) become ''.
368+
const parts = value.split("'").map((p) => `'${p}'`).join(`, "'", `);
369+
return `concat(${parts})`;
370+
}

0 commit comments

Comments
 (0)