
Commit 5bee03d

fix: zoom in a specific area
Co-authored-by: Srinivasan Sekar <srinivasan.sekar1990@gmail.com>
1 parent 4f9fc3d commit 5bee03d

3 files changed

Lines changed: 239 additions & 5 deletions


CHANGELOG.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -2,7 +2,7 @@
 
 ### Features
 
-* add zoom in and out support ([#23](https://github.com/AppiumTestDistribution/AppClaw/issues/23)) ([0e98ec3](https://github.com/AppiumTestDistribution/AppClaw/commit/0e98ec3e5eec4ba3b58460dd67e856f61b4ac717))
+- add zoom in and out support ([#23](https://github.com/AppiumTestDistribution/AppClaw/issues/23)) ([0e98ec3](https://github.com/AppiumTestDistribution/AppClaw/commit/0e98ec3e5eec4ba3b58460dd67e856f61b4ac717))
 
 ## [1.2.1](https://github.com/AppiumTestDistribution/AppClaw/compare/v1.2.0...v1.2.1) (2026-04-24)
 
```

src/explorer/flow-agent.ts

Lines changed: 225 additions & 0 deletions
```ts
/**
 * Flow Generator Agent — uses ToolLoopAgent to generate YAML flows one at a time.
 *
 * Unlike the batch generateFlows() approach (single generateObject call for all flows),
 * this agent generates each flow independently via a save_flow tool, giving the LLM
 * full context of previously generated flows so each new one is meaningfully different.
 *
 * Used in Phase 3 of the Explorer pipeline (after PRD analysis and optional crawling).
 */

import { ToolLoopAgent, tool, isLoopFinished, stepCountIs } from 'ai';
import { z } from 'zod';
import type { PRDAnalysis, ScreenGraph, GeneratedFlow, ScreenshotData } from './types.js';

const FLOW_AGENT_INSTRUCTIONS = `You are a mobile test automation expert generating YAML flow files for AppClaw.

Each flow MUST follow this EXACT format:

\`\`\`yaml
# One-line comment describing what this flow does.
name: Descriptive flow name
---
- open <app name> app
- Click on Search Button
- Type "Appium 3.0" in the search bar
- Perform Search
- Scroll down 2 times until TestMu AI is visible
- done: "TestMu AI video for Appium 3.0 on YouTube is visible"
\`\`\`

The YAML has two documents separated by \`---\`:
- Document 1: metadata with \`name:\` field
- Document 2: a list of steps as natural language strings

Supported step patterns (use NATURAL LANGUAGE):
- open <app name> app → Opens the app by name
- Click on <element> → Taps a UI element
- Tap <element> → Same as click
- Type "<text>" in the <field> → Types text. Text MUST be in quotes.
- Perform Search / Submit → Presses Enter/Return
- Scroll down/up → Swipe gesture
- Scroll down N times until "X" is visible → Scroll+assert combo
- wait N s → Wait N seconds
- go back → Navigate back
- assert "X" is visible → Verify text is on screen
- done: "message" → Mark flow complete

CRITICAL FORMAT RULES:
- Type steps MUST always quote the text: Type "search term" in the search bar
- Use natural language for ALL steps — never use structured YAML keys like tap:, type:, wait:
- Each flow MUST start with "open <app name> app"
- Each flow MUST end with done: "description of what was achieved"
- Each flow MUST be a complete, standalone user journey (5–15 steps)
- Flows MUST be diverse — do NOT generate similar flows

Your job:
1. Read the PRD analysis, screen data, and any screenshots in the user message
2. For each of the N flows requested, generate a distinct YAML flow and call save_flow
3. Prioritize high-priority journeys first, then medium, then low
4. If screenshots are provided, look at them carefully — use the EXACT button labels, text, and UI element names you can see
5. If screen graph data is available, use REAL element labels from it
6. Call save_flow once per flow. Stop when the tool returns remaining: 0`;

type ContentPart =
  | { type: 'text'; text: string }
  | { type: 'image'; image: string; mimeType: string };

function buildPromptParts(
  analysis: PRDAnalysis,
  numFlows: number,
  screenGraph?: ScreenGraph,
  screenshots?: ScreenshotData[]
): ContentPart[] {
  const journeyContext = analysis.userJourneys
    .sort((a, b) => {
      const priority = { high: 0, medium: 1, low: 2 };
      return priority[a.priority] - priority[b.priority];
    })
    .map(
      (j, i) =>
        `${i + 1}. [${j.priority}] ${j.name}: ${j.description}\n Steps: ${j.steps.join(' → ')}`
    )
    .join('\n');

  const featureContext = analysis.features
    .map((f) => `- ${f.name}: ${f.description} (elements: ${f.expectedElements.join(', ')})`)
    .join('\n');

  let screenContext = '';
  if (screenGraph && screenGraph.screens.length > 0) {
    screenContext = '\n\n## Real Device Screen Data\n';
    screenContext += `Discovered ${screenGraph.screens.length} screens with ${screenGraph.transitions.length} transitions.\n\n`;

    for (const screen of screenGraph.screens) {
      screenContext += `### ${screen.id}\n`;
      if (screen.reachedVia) {
        screenContext += `Reached via: ${screen.reachedVia.action} from ${screen.reachedVia.fromScreen}\n`;
      }
      screenContext += `Visible texts: ${screen.visibleTexts.slice(0, 20).join(', ')}\n`;
      screenContext += `Tappable elements: ${screen.tappableElements.map((e) => `"${e.label}" (${e.type})`).join(', ')}\n\n`;
    }

    screenContext += '### Navigation Paths\n';
    for (const t of screenGraph.transitions) {
      screenContext += `- ${t.fromScreen} → tap "${t.element}" → ${t.toScreen}\n`;
    }

    screenContext +=
      '\nIMPORTANT: Use the REAL element labels from screen data above — they are the actual UI labels on the device.';
  }

  const hasScreenshots = screenshots && screenshots.length > 0;

  const textContent = `Generate exactly ${numFlows} YAML test flows for this mobile app. Call save_flow once for each flow.

## App: ${analysis.appName}
${analysis.appId ? `Package: ${analysis.appId}` : ''}
Platform: ${analysis.platform}

## Features
${featureContext}

## User Journeys (prioritized — pick the top ${numFlows})
${journeyContext}
${screenContext}
${hasScreenshots ? `\n## App Screenshots (${screenshots!.length} provided)\nThe screenshots below show the actual app UI. Use the EXACT button labels, text, and element names visible in them when writing flow steps.` : ''}

Generate ${numFlows} diverse flows. Include a mix of core happy-path flows, secondary feature flows, and at least one edge case if applicable.`;

  const parts: ContentPart[] = [{ type: 'text', text: textContent }];

  if (hasScreenshots) {
    for (const shot of screenshots!) {
      // Label each image so the agent knows which screen it's looking at
      parts.push({ type: 'text', text: `[Screenshot: ${shot.filename}]` });
      parts.push({ type: 'image', image: shot.base64, mimeType: shot.mimeType });
    }
  }

  return parts;
}

function buildYaml(name: string, comment: string, steps: string[]): string {
  const lines: string[] = [`# ${comment}`, `name: ${name}`, '---'];
  for (const step of steps) {
    lines.push(`- ${step}`);
  }
  const lastStep = steps[steps.length - 1];
  if (!lastStep?.toLowerCase().startsWith('done')) {
    lines.push('- done');
  }
  return lines.join('\n');
}

/**
 * Generate YAML flows using a ToolLoopAgent.
 *
 * The agent generates flows one at a time by calling save_flow for each,
 * giving it full context of what it has already generated so each new
 * flow is meaningfully different.
 */
export async function generateFlowsWithAgent(
  analysis: PRDAnalysis,
  numFlows: number,
  model: any,
  providerOptions?: Record<string, any>,
  screenGraph?: ScreenGraph,
  screenshots?: ScreenshotData[]
): Promise<GeneratedFlow[]> {
  const generatedFlows: GeneratedFlow[] = [];

  const agent = new ToolLoopAgent({
    model,
    instructions: FLOW_AGENT_INSTRUCTIONS,
    tools: {
      save_flow: tool({
        description:
          'Save a generated YAML test flow. Call this once per flow after composing its name, comment, and steps.',
        inputSchema: z.object({
          name: z
            .string()
            .describe(
              "Descriptive flow name (e.g. 'YouTube — search Appium 3.0 and verify TestMu AI video')"
            ),
          comment: z.string().describe('One-line comment placed at the top of the YAML file'),
          journey: z.string().describe('Which user journey this flow covers'),
          steps: z
            .array(z.string())
            .describe('Ordered natural language steps including the final done: step'),
        }),
        execute: async ({ name, comment, journey, steps }) => {
          generatedFlows.push({
            name,
            description: comment,
            yamlContent: buildYaml(name, comment, steps),
            journey,
          });

          const remaining = numFlows - generatedFlows.length;
          return {
            saved: true,
            flowsGenerated: generatedFlows.length,
            remaining,
          };
        },
      }),
    },
    // Stop when the agent makes a step with no tool calls (it's done)
    // or after a hard cap to prevent runaway loops
    stopWhen: [isLoopFinished(), stepCountIs(numFlows + 3)],
    ...(providerOptions ? { providerOptions } : {}),
  });

  const parts = buildPromptParts(analysis, numFlows, screenGraph, screenshots);

  // Use multimodal message format when screenshots are present, plain string otherwise
  const prompt =
    parts.length === 1 && parts[0].type === 'text'
      ? parts[0].text
      : [{ role: 'user' as const, content: parts }];

  await agent.generate({ prompt });

  return generatedFlows;
}
```
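
For context, a minimal sketch of how a Phase 3 caller might invoke this agent. The `PRDAnalysis` literal below only fills the fields the prompt builder reads (appName, appId, platform, features, userJourneys), and the `@ai-sdk/anthropic` provider import and model id are illustrative assumptions rather than part of this commit:

```ts
// Hypothetical caller sketch: values, provider, and model id are illustrative.
import { anthropic } from '@ai-sdk/anthropic';
import { generateFlowsWithAgent } from './flow-agent.js';
import type { PRDAnalysis } from './types.js';

// Cast because the real PRDAnalysis type may require fields not shown here.
const analysis = {
  appName: 'YouTube',
  appId: 'com.google.android.youtube',
  platform: 'android',
  features: [
    {
      name: 'Search',
      description: 'Find videos by keyword',
      expectedElements: ['Search button', 'Search bar', 'Results list'],
    },
  ],
  userJourneys: [
    {
      name: 'Search and open a video',
      description: 'Search for a term and open the first matching result',
      priority: 'high',
      steps: ['Open app', 'Tap search', 'Type query', 'Open first result'],
    },
  ],
} as PRDAnalysis;

// Any AI SDK model instance works here; pick whatever the pipeline is configured with.
const flows = await generateFlowsWithAgent(analysis, 3, anthropic('claude-3-5-sonnet-latest'));

for (const flow of flows) {
  console.log(`--- ${flow.name} (${flow.journey}) ---\n${flow.yamlContent}\n`);
}
```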

src/flow/run-yaml-flow.ts

Lines changed: 13 additions & 4 deletions
```diff
@@ -1055,10 +1055,19 @@ export async function executeStep(
   }
 
   let pinchArgs: Record<string, unknown> = { action: 'pinch_zoom', scale: step.scale };
-  if (elementUUID && !isAIElement(elementUUID)) {
-    // Only pass real Appium element UUIDs — ai-element: synthetic UUIDs are not
-    // in Appium's element cache and will cause a 404 in the pinch handler.
-    pinchArgs.elementUUID = elementUUID;
+  if (elementUUID) {
+    if (isAIElement(elementUUID)) {
+      // ai-element: UUIDs are not in Appium's cache — extract the center coordinates
+      // and pass them as x,y so the pinch is centered on the located element.
+      const coords = parseAIElementCoords(elementUUID);
+      if (coords) {
+        pinchArgs.x = coords.x;
+        pinchArgs.y = coords.y;
+      }
+    } else {
+      // Real Appium UUID: pass directly for element-bounds-based pinch.
+      pinchArgs.elementUUID = elementUUID;
+    }
   }
 
   const zoomResult = await mcp.callTool('appium_gesture', pinchArgs);
```
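
The new branch relies on `parseAIElementCoords`, which is defined elsewhere in run-yaml-flow.ts and not shown in this commit. As a rough illustration of the idea only, and assuming the synthetic UUID embeds the element's center as something like `ai-element:<x>,<y>`, such a helper could look like this:

```ts
// Illustrative sketch, not the repository's implementation: the actual
// ai-element UUID format used by run-yaml-flow.ts may differ.
function parseAIElementCoords(uuid: string): { x: number; y: number } | null {
  // Assumed format: "ai-element:<centerX>,<centerY>"
  const match = /^ai-element:(\d+),(\d+)$/.exec(uuid);
  if (!match) return null;
  return { x: Number(match[1]), y: Number(match[2]) };
}
```

Either way, the resulting `appium_gesture` call now carries an explicit target: `{ action: 'pinch_zoom', scale, x, y }` for AI-located elements versus `{ action: 'pinch_zoom', scale, elementUUID }` for cached Appium elements, which is what lets the pinch zoom in on a specific area instead of the screen center.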
