Skip to content

Commit 3e0a7b5

Browse files
authored
feat: enrich screenshot diff guidance (#403)
* feat: enrich screenshot diff guidance
* fix: address screenshot diff review feedback
* docs: organize screenshot diff skill guidance
* refactor: prune screenshot diff result metadata
* refactor: tighten screenshot diff readout
* refactor: generalize screenshot diff heuristics
* feat: add compact screenshot diff hints
* refactor: prune screenshot diff guidance output
1 parent e31ff32 commit 3e0a7b5

17 files changed

+2445
-30
lines changed

skills/agent-device/references/verification.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Open this file when the task needs evidence, regression checks, replay maintenan
88

99
- `screenshot`
1010
- `diff snapshot`
11+
- `diff screenshot`
1112
- `record`
1213
- `replay -u`
1314
- `perf`
@@ -41,12 +42,27 @@ agent-device diff snapshot -i
4142
- Run `diff snapshot` to confirm the expected structural change.
4243
- Re-run full `snapshot` only when you need fresh refs.
4344

44-
## Visual artifacts
45+
## Screenshot artifacts
4546

4647
Use `screenshot` when the proof needs a rendered image instead of a structural tree.
4748

4849
- Add `--overlay-refs` when you want the saved PNG to show fresh `@eN` refs burned into the screenshot.
4950

51+
## Visual regression with diff screenshot
52+
53+
Use `diff screenshot` when comparing the current rendered screen against a saved visual baseline.
54+
55+
```bash
56+
agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png
57+
agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --overlay-refs
58+
```
59+
60+
- Text output includes ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance. JSON output also includes normalized bounds for each region.
61+
- The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined.
62+
- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas, movement clusters, and bbox size-change hints. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG.
63+
- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not as semantic icon recognition.
64+
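- Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. The guide is captured only when the diff finds changes and the image dimensions match; with `--out`, it is saved next to the diff image with a `.current-overlay` suffix (for example, `/tmp/diff.current-overlay.png` for `--out /tmp/diff.png`). When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region.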
65+
5066
## Session recording
5167

5268
Use `record` for debugging, documentation, or shareable verification artifacts.

src/__tests__/cli-diff.test.ts

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,25 @@ async function runCliCapture(
9292
fs.mkdirSync(path.dirname(outPath), { recursive: true });
9393
fs.writeFileSync(outPath, solidPngBuffer(10, 10, { r: 255, g: 255, b: 255 }));
9494
}
95-
return { ok: true, data: { path: outPath } };
95+
return {
96+
ok: true,
97+
data: {
98+
path: outPath,
99+
...(req.flags?.overlayRefs
100+
? {
101+
overlayRefs: [
102+
{
103+
ref: 'e1',
104+
label: 'Continue',
105+
rect: { x: 1, y: 2, width: 3, height: 4 },
106+
overlayRect: { x: 1, y: 2, width: 3, height: 4 },
107+
center: { x: 3, y: 4 },
108+
},
109+
],
110+
}
111+
: {}),
112+
},
113+
};
96114
}
97115
return {
98116
ok: true,
@@ -249,11 +267,13 @@ describe('cli diff commands', () => {
249267
'screenshot',
250268
'--baseline',
251269
baseline,
270+
'--overlay-refs',
252271
'--threshold',
253272
'0.2',
254273
]);
255274
assert.equal(result.code, null);
256275
// The client-backed command captures a screenshot via the daemon client
276+
// and skips a second overlay capture when there is no diff to map.
257277
assert.equal(result.calls.length, 1);
258278
const call = result.calls[0]!;
259279
assert.equal(call.command, 'screenshot');
@@ -287,12 +307,15 @@ describe('cli diff commands', () => {
287307
const originalHome = process.env.HOME;
288308
const baselineRelative = path.join('fixtures', 'baseline.png');
289309
const diffRelative = path.join('fixtures', 'diff.png');
310+
const overlayRelative = path.join('fixtures', 'diff.current-overlay.png');
290311
const baseline = path.join(fakeHome, baselineRelative);
291312
const diffOut = path.join(fakeHome, diffRelative);
313+
const overlayOut = path.join(fakeHome, overlayRelative);
292314

293315
fs.mkdirSync(path.dirname(baseline), { recursive: true });
294316
fs.writeFileSync(baseline, solidPngBuffer(10, 10, { r: 255, g: 255, b: 255 }));
295317
fs.writeFileSync(diffOut, 'stale diff');
318+
fs.writeFileSync(overlayOut, 'stale overlay');
296319
process.env.HOME = fakeHome;
297320

298321
try {
@@ -304,6 +327,7 @@ describe('cli diff commands', () => {
304327
`~/${baselineRelative}`,
305328
'--out',
306329
`~/${diffRelative}`,
330+
'--overlay-refs',
307331
'--json',
308332
],
309333
{ preserveHome: true },
@@ -315,10 +339,50 @@ describe('cli diff commands', () => {
315339
assert.equal(payload.success, true);
316340
assert.equal(payload.data.match, true);
317341
assert.equal(fs.existsSync(diffOut), false);
342+
assert.equal(fs.existsSync(overlayOut), false);
318343
} finally {
319344
if (typeof originalHome === 'string') process.env.HOME = originalHome;
320345
else delete process.env.HOME;
321346
fs.rmSync(fakeHome, { recursive: true, force: true });
322347
}
323348
});
349+
350+
test('diff screenshot --overlay-refs writes a separate current overlay guide', async () => {
351+
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-test-'));
352+
const baseline = path.join(dir, 'baseline.png');
353+
const diffOut = path.join(dir, 'diff.png');
354+
const overlayOut = path.join(dir, 'diff.current-overlay.png');
355+
fs.writeFileSync(baseline, solidPngBuffer(10, 10, { r: 0, g: 0, b: 0 }));
356+
357+
try {
358+
const result = await runCliCapture([
359+
'diff',
360+
'screenshot',
361+
'--baseline',
362+
baseline,
363+
'--out',
364+
diffOut,
365+
'--overlay-refs',
366+
'--threshold',
367+
'0',
368+
]);
369+
assert.equal(result.code, null);
370+
assert.equal(result.calls.length, 2);
371+
assert.equal(result.calls[0]?.command, 'screenshot');
372+
assert.equal(result.calls[0]?.flags?.overlayRefs, undefined);
373+
assert.equal(result.calls[1]?.command, 'screenshot');
374+
assert.equal(result.calls[1]?.flags?.overlayRefs, true);
375+
assert.equal(result.calls[1]?.positionals?.[0], overlayOut);
376+
assert.match(result.stdout, /Diff image:/);
377+
assert.match(result.stdout, /Current overlay:/);
378+
assert.match(result.stdout, /diff\.current-overlay\.png \(1 refs\)/);
379+
assert.match(
380+
result.stdout,
381+
/size=large shape=large-area density=100% avgColor=#000000->#ffffff luminance=0->255/,
382+
);
383+
assert.match(result.stdout, /overlaps @e1 "Continue", 12% of region/);
384+
} finally {
385+
fs.rmSync(dir, { recursive: true, force: true });
386+
}
387+
});
324388
});

src/cli/commands/screenshot.ts

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import path from 'node:path';
44
import { formatScreenshotDiffText, formatSnapshotDiffText } from '../../utils/output.ts';
55
import { AppError } from '../../utils/errors.ts';
66
import { compareScreenshots, type ScreenshotDiffResult } from '../../utils/screenshot-diff.ts';
7+
import { attachCurrentOverlayMatches } from '../../utils/screenshot-diff-overlay-matches.ts';
78
import { resolveUserPath } from '../../utils/path-resolution.ts';
89
import { buildSelectionOptions, writeCommandOutput } from './shared.ts';
910
import type { ClientCommandHandler } from './router.ts';
@@ -71,6 +72,26 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl
7172
threshold: thresholdNum,
7273
outputPath,
7374
});
75+
if (flags.overlayRefs && !result.match && !result.dimensionMismatch) {
76+
const overlayResult = await client.capture.screenshot({
77+
path: outputPath ? deriveCurrentOverlayPath(outputPath) : undefined,
78+
overlayRefs: true,
79+
});
80+
result = {
81+
...result,
82+
currentOverlayPath: overlayResult.path,
83+
...(overlayResult.overlayRefs
84+
? { currentOverlayRefCount: overlayResult.overlayRefs.length }
85+
: {}),
86+
...(result.regions && overlayResult.overlayRefs
87+
? {
88+
regions: attachCurrentOverlayMatches(result.regions, overlayResult.overlayRefs),
89+
}
90+
: {}),
91+
};
92+
} else if (flags.overlayRefs && outputPath) {
93+
removeStaleCurrentOverlay(outputPath);
94+
}
7495
} finally {
7596
try {
7697
fs.unlinkSync(currentPath);
@@ -83,3 +104,21 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl
83104
writeCommandOutput(flags, result, () => formatScreenshotDiffText(result));
84105
return true;
85106
};
107+
108+
function deriveCurrentOverlayPath(outputPath: string): string {
109+
const extension = path.extname(outputPath);
110+
const base = extension ? outputPath.slice(0, -extension.length) : outputPath;
111+
return `${base}.current-overlay${extension || '.png'}`;
112+
}
113+
114+
function removeStaleCurrentOverlay(outputPath: string): void {
115+
try {
116+
fs.unlinkSync(deriveCurrentOverlayPath(outputPath));
117+
} catch (error) {
118+
if (!isFsError(error, 'ENOENT')) throw error;
119+
}
120+
}
121+
122+
function isFsError(error: unknown, code: string): error is NodeJS.ErrnoException {
123+
return typeof error === 'object' && error !== null && 'code' in error && error.code === code;
124+
}

src/utils/__tests__/output.test.ts

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,11 +664,110 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color'
664664
totalPixels: 10000,
665665
mismatchPercentage: 5,
666666
diffPath: '/tmp/test/diff.png',
667+
currentOverlayPath: '/tmp/test/diff.current-overlay.png',
668+
currentOverlayRefCount: 1,
669+
regions: [
670+
{
671+
index: 1,
672+
rect: { x: 10, y: 20, width: 100, height: 40 },
673+
normalizedRect: { x: 10, y: 20, width: 100, height: 40 },
674+
differentPixels: 350,
675+
shareOfDiffPercentage: 70,
676+
densityPercentage: 8.75,
677+
shape: 'horizontal-band',
678+
size: 'medium',
679+
location: 'top-left',
680+
averageBaselineColorHex: '#141414',
681+
averageCurrentColorHex: '#dcdcdc',
682+
baselineLuminance: 20,
683+
currentLuminance: 220,
684+
dominantChange: 'brighter',
685+
currentOverlayMatches: [
686+
{
687+
ref: 'e1',
688+
label: 'Continue',
689+
rect: { x: 1, y: 2, width: 3, height: 4 },
690+
regionCoveragePercentage: 12,
691+
},
692+
],
693+
},
694+
],
695+
ocr: {
696+
provider: 'tesseract',
697+
baselineBlocks: 2,
698+
currentBlocks: 2,
699+
matches: [
700+
{
701+
text: 'Wi-Fi',
702+
baselineRect: { x: 120, y: 320, width: 60, height: 22 },
703+
currentRect: { x: 130, y: 332, width: 70, height: 22 },
704+
delta: { x: 10, y: 12, width: 10, height: 0 },
705+
confidence: 94,
706+
possibleTextMetricMismatch: true,
707+
},
708+
],
709+
movementClusters: [
710+
{
711+
texts: ['Wi-Fi', 'Bluetooth'],
712+
xRange: { min: 10, max: 12 },
713+
yRange: { min: 10, max: 14 },
714+
},
715+
],
716+
},
717+
nonTextDeltas: [
718+
{
719+
index: 1,
720+
regionIndex: 1,
721+
slot: 'leading',
722+
likelyKind: 'icon',
723+
rect: { x: 80, y: 318, width: 30, height: 30 },
724+
nearestText: 'Wi-Fi',
725+
},
726+
{
727+
index: 2,
728+
regionIndex: 1,
729+
slot: 'separator',
730+
likelyKind: 'separator',
731+
rect: { x: 90, y: 360, width: 120, height: 2 },
732+
},
733+
],
667734
}),
668735
);
669736
assert.match(text, / 5% pixels differ/);
670737
assert.match(text, /Diff image:/);
738+
assert.match(text, /Current overlay:/);
739+
assert.match(text, /diff\.current-overlay\.png \(1 refs\)/);
671740
assert.match(text, /500 different \/ 10000 total pixels/);
741+
assert.match(text, /Hints:/);
742+
assert.match(
743+
text,
744+
/text movement cluster: "Wi-Fi", "Bluetooth" dx=\+10\.\.\+12px dy=\+10\.\.\+14px/,
745+
);
746+
assert.match(text, /non-text controls: icon near "Wi-Fi" r1/);
747+
assert.match(text, /non-text boundaries: separator r1/);
748+
assert.match(text, /Changed regions:/);
749+
assert.match(text, /1\. top-left x=10 y=20 100x40, 70% of diff, change=brighter/);
750+
assert.match(
751+
text,
752+
/size=medium shape=horizontal-band density=8\.75% avgColor=#141414->#dcdcdc luminance=20->220/,
753+
);
754+
assert.match(text, /overlaps @e1 "Continue", 12% of region/);
755+
assert.match(
756+
text,
757+
/OCR text deltas \(tesseract; baselineBlocks=2 currentBlocks=2; showing 1\/1; px\):/,
758+
);
759+
assert.match(
760+
text,
761+
/item \| text \| movePx \| sizeDeltaPx \| bboxBaseline \| bboxCurrent \| confidence \| issueHint/,
762+
);
763+
assert.match(
764+
text,
765+
/1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| 94 \| ocr-bbox-size-change/,
766+
);
767+
assert.match(text, /Non-text visual deltas \(showing 2\/2; px\):/);
768+
assert.match(text, /item \| region \| slot \| kind \| bboxCurrent \| nearestText/);
769+
assert.match(text, /1 \| r1 \| leading \| icon \| x=80,y=318,w=30,h=30 \| "Wi-Fi"/);
770+
assert.match(text, /2 \| r1 \| separator \| separator \| x=90,y=360,w=120,h=2 \| -/);
672771
assert.equal(text.includes('\x1b['), false);
673772
});
674773

0 commit comments

Comments
 (0)