Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions skills/agent-device/references/verification.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ agent-device diff snapshot -i
Use `screenshot` when the proof needs a rendered image instead of a structural tree.

- Add `--overlay-refs` when you want the saved PNG to show fresh `@eN` refs burned into the screenshot.
- Use `diff screenshot --baseline <path> --out <diff.png>` when comparing against a saved visual baseline. The text and JSON output include ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, luminance, and a short description so an implementation agent can focus on the biggest visual mismatches instead of a single global pixel percentage. The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined.
- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas such as moved labels and possible text metric mismatches. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG.
- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, separators, and card/background movement, not semantic icon recognition.
- Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region.

## Session recording

Expand Down
61 changes: 60 additions & 1 deletion src/__tests__/cli-diff.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,25 @@ async function runCliCapture(
fs.mkdirSync(path.dirname(outPath), { recursive: true });
fs.writeFileSync(outPath, solidPngBuffer(10, 10, { r: 255, g: 255, b: 255 }));
}
return { ok: true, data: { path: outPath } };
return {
ok: true,
data: {
path: outPath,
...(req.flags?.overlayRefs
? {
overlayRefs: [
{
ref: 'e1',
label: 'Continue',
rect: { x: 1, y: 2, width: 3, height: 4 },
overlayRect: { x: 1, y: 2, width: 3, height: 4 },
center: { x: 3, y: 4 },
},
],
}
: {}),
},
};
}
return {
ok: true,
Expand Down Expand Up @@ -249,11 +267,13 @@ describe('cli diff commands', () => {
'screenshot',
'--baseline',
baseline,
'--overlay-refs',
'--threshold',
'0.2',
]);
assert.equal(result.code, null);
// The client-backed command captures a screenshot via the daemon client
// and skips a second overlay capture when there is no diff to map.
assert.equal(result.calls.length, 1);
const call = result.calls[0]!;
assert.equal(call.command, 'screenshot');
Expand Down Expand Up @@ -321,4 +341,43 @@ describe('cli diff commands', () => {
fs.rmSync(fakeHome, { recursive: true, force: true });
}
});

test('diff screenshot --overlay-refs writes a separate current overlay guide', async () => {
  // Scratch workspace holding a solid black 10x10 baseline and the diff output path.
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-test-'));
  const baselinePath = path.join(workDir, 'baseline.png');
  const diffPath = path.join(workDir, 'diff.png');
  // The overlay guide is expected beside the diff image, with `.current-overlay`
  // inserted before the extension.
  const expectedOverlayPath = path.join(workDir, 'diff.current-overlay.png');
  fs.writeFileSync(baselinePath, solidPngBuffer(10, 10, { r: 0, g: 0, b: 0 }));

  try {
    const { code, calls, stdout } = await runCliCapture([
      'diff',
      'screenshot',
      '--baseline',
      baselinePath,
      '--out',
      diffPath,
      '--overlay-refs',
      '--threshold',
      '0',
    ]);
    assert.equal(code, null);

    // Two captures are expected: the raw screenshot used for the pixel comparison
    // (no overlay flag), followed by the overlay guide written to the derived path.
    assert.equal(calls.length, 2);
    const rawCapture = calls[0];
    const overlayCapture = calls[1];
    assert.equal(rawCapture?.command, 'screenshot');
    assert.equal(rawCapture?.flags?.overlayRefs, undefined);
    assert.equal(overlayCapture?.command, 'screenshot');
    assert.equal(overlayCapture?.flags?.overlayRefs, true);
    assert.equal(overlayCapture?.positionals?.[0], expectedOverlayPath);

    // Text output must mention the diff image, the overlay guide with its ref count,
    // the whole-image changed region, and the overlay ref that intersects it.
    const expectedOutputPatterns = [
      /Diff image:/,
      /Current overlay:/,
      /diff\.current-overlay\.png \(1 refs\)/,
      /size=large shape=large-area density=100% boundsPct=0,0,100,100 avgColor=#000000->#ffffff luminance=0->255/,
      /overlaps @e1 "Continue", 12% of region/,
    ];
    for (const pattern of expectedOutputPatterns) {
      assert.match(stdout, pattern);
    }
  } finally {
    // Always remove the scratch directory, even when an assertion fails.
    fs.rmSync(workDir, { recursive: true, force: true });
  }
});
});
23 changes: 23 additions & 0 deletions src/cli/commands/screenshot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import path from 'node:path';
import { formatScreenshotDiffText, formatSnapshotDiffText } from '../../utils/output.ts';
import { AppError } from '../../utils/errors.ts';
import { compareScreenshots, type ScreenshotDiffResult } from '../../utils/screenshot-diff.ts';
import { attachCurrentOverlayMatches } from '../../utils/screenshot-diff-overlay-matches.ts';
import { resolveUserPath } from '../../utils/path-resolution.ts';
import { buildSelectionOptions, writeCommandOutput } from './shared.ts';
import type { ClientCommandHandler } from './router.ts';
Expand Down Expand Up @@ -71,6 +72,22 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl
threshold: thresholdNum,
outputPath,
});
if (flags.overlayRefs && !result.match && !result.dimensionMismatch) {
const overlayResult = await client.capture.screenshot({
path: outputPath ? deriveCurrentOverlayPath(outputPath) : undefined,
overlayRefs: true,
});
Comment on lines +75 to +79

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Delete stale current-overlay artifact on no-diff runs

The overlay guide is only captured on mismatch, but there is no cleanup when a later `diff screenshot --overlay-refs --out ...` run matches (or has a dimension mismatch). That leaves an old `*.current-overlay.*` file on disk, which can be mistaken for fresh output by users or scripts that rely on the deterministic filename.

Useful? React with 👍 / 👎.

result = {
...result,
currentOverlayPath: overlayResult.path,
...(overlayResult.overlayRefs ? { currentOverlayRefs: overlayResult.overlayRefs } : {}),
...(result.regions && overlayResult.overlayRefs
? {
regions: attachCurrentOverlayMatches(result.regions, overlayResult.overlayRefs),
}
: {}),
};
}
} finally {
try {
fs.unlinkSync(currentPath);
Expand All @@ -83,3 +100,9 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl
writeCommandOutput(flags, result, () => formatScreenshotDiffText(result));
return true;
};

/**
 * Derives the path of the current-screen overlay guide from the diff output path.
 *
 * `.current-overlay` is inserted before the file extension; when the output path
 * has no extension, `.png` is appended instead.
 *
 * @param outputPath - Path the diff image is written to.
 * @returns Sibling path for the overlay guide image.
 */
function deriveCurrentOverlayPath(outputPath: string): string {
  const ext = path.extname(outputPath);
  // Slicing by the extension length also covers the no-extension case (ext === '').
  const stem = outputPath.slice(0, outputPath.length - ext.length);
  const overlayExt = ext === '' ? '.png' : ext;
  return stem + '.current-overlay' + overlayExt;
}
111 changes: 111 additions & 0 deletions src/utils/__tests__/output.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -664,11 +664,122 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color'
totalPixels: 10000,
mismatchPercentage: 5,
diffPath: '/tmp/test/diff.png',
currentOverlayPath: '/tmp/test/diff.current-overlay.png',
currentOverlayRefs: [
{
ref: 'e1',
label: 'Continue',
rect: { x: 1, y: 2, width: 3, height: 4 },
overlayRect: { x: 1, y: 2, width: 3, height: 4 },
center: { x: 3, y: 4 },
},
],
regions: [
{
index: 1,
rect: { x: 10, y: 20, width: 100, height: 40 },
center: { x: 60, y: 40 },
normalizedRect: { x: 10, y: 20, width: 100, height: 40 },
differentPixels: 350,
shareOfDiffPercentage: 70,
imagePercentage: 3.5,
densityPercentage: 8.75,
shape: 'horizontal-band',
size: 'medium',
location: 'top-left',
averageBaselineColor: { r: 20, g: 20, b: 20 },
averageCurrentColor: { r: 220, g: 220, b: 220 },
averageBaselineColorHex: '#141414',
averageCurrentColorHex: '#dcdcdc',
baselineLuminance: 20,
currentLuminance: 220,
dominantChange: 'brighter',
description:
"medium region (horizontal-band) in the top-left; 8.75% of this region's pixels differ; current is brighter.",
currentOverlayMatches: [
{
ref: 'e1',
label: 'Continue',
rect: { x: 1, y: 2, width: 3, height: 4 },
overlapPercentage: 100,
regionCoveragePercentage: 12,
},
],
},
],
ocr: {
provider: 'tesseract',
baselineBlocks: 2,
currentBlocks: 2,
matches: [
{
text: 'Wi-Fi',
baselineRect: { x: 120, y: 320, width: 60, height: 22 },
currentRect: { x: 130, y: 332, width: 70, height: 22 },
baselineNormalizedRect: { x: 12, y: 32, width: 6, height: 2.2 },
currentNormalizedRect: { x: 13, y: 33.2, width: 7, height: 2.2 },
delta: { x: 10, y: 12, width: 10, height: 0 },
confidence: 94,
widthRatio: 1.167,
heightRatio: 1,
possibleTextMetricMismatch: true,
description:
'Text "Wi-Fi" moved 10px right, 12px down; text box is 10px wider; possible font, weight, or text rendering mismatch.',
},
],
},
nonTextDeltas: [
{
index: 1,
regionIndex: 1,
slot: 'leading',
likelyKind: 'icon',
rect: { x: 80, y: 318, width: 30, height: 30 },
normalizedRect: { x: 8, y: 31.8, width: 3, height: 3 },
differentPixels: 400,
densityPercentage: 44.44,
nearestText: 'Wi-Fi',
nearestTextDistancePx: 45,
evidence: [
'residual-diff-outside-ocr',
'nearest-text="Wi-Fi"',
'slot=leading',
'shape=icon',
],
},
],
}),
);
assert.match(text, /✗ 5% pixels differ/);
assert.match(text, /Diff image:/);
assert.match(text, /Current overlay:/);
assert.match(text, /diff\.current-overlay\.png \(1 refs\)/);
assert.match(text, /500 different \/ 10000 total pixels/);
assert.match(text, /Changed regions:/);
assert.match(text, /1\. top-left x=10 y=20 100x40, 70% of diff, current is brighter/);
assert.match(
text,
/size=medium shape=horizontal-band density=8\.75% boundsPct=10,20,100,40 avgColor=#141414->#dcdcdc luminance=20->220/,
);
assert.match(text, /overlaps @e1 "Continue", 12% of region/);
assert.match(
text,
/OCR text deltas \(tesseract; baselineBlocks=2 currentBlocks=2; showing 1\/1; px\):/,
);
assert.match(
text,
/item \| text \| movePx \| sizeDeltaPx \| bboxBaseline \| bboxCurrent \| textRatio \| confidence \| issueHint/,
);
assert.match(
text,
/1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| w=1\.167 h=1 \| 94 \| possible-text-metric-mismatch/,
);
assert.match(text, /Non-text visual deltas \(showing 1\/1; px\):/);
assert.match(text, /item \| region \| slot \| kind \| bboxCurrent \| nearestText \| evidence/);
assert.match(
text,
/1 \| r1 \| leading \| icon \| x=80,y=318,w=30,h=30 \| "Wi-Fi" \| residual-diff-outside-ocr,nearest-text="Wi-Fi",slot=leading,shape=icon/,
);
assert.equal(text.includes('\x1b['), false);
});

Expand Down
80 changes: 80 additions & 0 deletions src/utils/__tests__/screenshot-diff-non-text.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import assert from 'node:assert/strict';
import { test } from 'vitest';
import { summarizeNonTextDiffDeltas } from '../screenshot-diff-non-text.ts';

/**
 * Marks every pixel of `rect` as changed (value 1) in a row-major diff mask.
 *
 * @param mask - Flat mask with one byte per pixel, indexed as `y * imageWidth + x`.
 * @param imageWidth - Number of pixels per mask row.
 * @param rect - Rectangle to paint, in pixel coordinates.
 */
function paintMaskRect(
  mask: Uint8Array,
  imageWidth: number,
  rect: { x: number; y: number; width: number; height: number },
): void {
  const xEnd = rect.x + rect.width;
  const yEnd = rect.y + rect.height;
  for (let row = rect.y; row < yEnd; row++) {
    const rowOffset = row * imageWidth;
    for (let col = rect.x; col < xEnd; col++) {
      mask[rowOffset + col] = 1;
    }
  }
}

test('summarizeNonTextDiffDeltas masks OCR text and reports leading icon residuals', () => {
  const imageWidth = 220;
  const imageHeight = 120;
  const changedPixels = new Uint8Array(imageWidth * imageHeight);

  // Two changed patches: a 20x20 square to the left of the text, and a 48x12 strip
  // that lies entirely inside the baseline OCR box declared below.
  const changedPatches = [
    { x: 20, y: 30, width: 20, height: 20 },
    { x: 70, y: 32, width: 48, height: 12 },
  ];
  for (const patch of changedPatches) {
    paintMaskRect(changedPixels, imageWidth, patch);
  }

  const deltas = summarizeNonTextDiffDeltas({
    diffMask: changedPixels,
    width: imageWidth,
    height: imageHeight,
    regions: [
      {
        // Single changed region enclosing both patches (400 + 576 = 976 pixels).
        index: 1,
        rect: { x: 0, y: 20, width: 180, height: 50 },
        center: { x: 90, y: 45 },
        normalizedRect: { x: 0, y: 16.67, width: 81.82, height: 41.67 },
        differentPixels: 976,
        shareOfDiffPercentage: 100,
        imagePercentage: 3.7,
        densityPercentage: 10.84,
        shape: 'horizontal-band',
        size: 'medium',
        location: 'center',
        averageBaselineColor: { r: 0, g: 0, b: 0 },
        averageCurrentColor: { r: 255, g: 255, b: 255 },
        averageBaselineColorHex: '#000000',
        averageCurrentColorHex: '#ffffff',
        baselineLuminance: 0,
        currentLuminance: 255,
        dominantChange: 'brighter',
        description: 'test region',
      },
    ],
    ocr: {
      provider: 'tesseract',
      baselineBlocks: 1,
      currentBlocks: 1,
      baselineBlocksRaw: [
        {
          // OCR box spanning x 68..128, y 28..52, which covers the 48x12 strip.
          text: 'Wi-Fi',
          confidence: 90,
          rect: { x: 68, y: 28, width: 60, height: 24 },
          normalizedRect: { x: 30.91, y: 23.33, width: 27.27, height: 20 },
        },
      ],
      currentBlocksRaw: [],
      matches: [],
    },
  });

  // Only the square outside the OCR box is expected to be reported.
  assert.equal(deltas.length, 1);
  const residual = deltas[0];
  assert.equal(residual?.regionIndex, 1);
  assert.equal(residual?.slot, 'leading');
  assert.equal(residual?.likelyKind, 'icon');
  assert.deepEqual(residual?.rect, { x: 20, y: 30, width: 20, height: 20 });
  assert.equal(residual?.nearestText, 'Wi-Fi');
  assert.deepEqual(residual?.evidence, [
    'residual-diff-outside-ocr',
    'nearest-text="Wi-Fi"',
    'slot=leading',
    'shape=icon',
  ]);
});
Loading
Loading