From e72b2664a4cc4e18b5d55486c65a018cf7e41dc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 12 Apr 2026 11:13:46 +0200 Subject: [PATCH 1/8] feat: enrich screenshot diff guidance --- .../agent-device/references/verification.md | 4 + src/__tests__/cli-diff.test.ts | 61 ++- src/cli/commands/screenshot.ts | 23 ++ src/utils/__tests__/output.test.ts | 111 ++++++ .../screenshot-diff-non-text.test.ts | 80 ++++ .../__tests__/screenshot-diff-ocr.test.ts | 71 ++++ src/utils/__tests__/screenshot-diff.test.ts | 120 +++++- src/utils/command-schema.ts | 6 +- src/utils/output.ts | 119 ++++++ src/utils/screenshot-diff-non-text.ts | 364 +++++++++++++++++ src/utils/screenshot-diff-ocr.ts | 373 ++++++++++++++++++ src/utils/screenshot-diff-overlay-matches.ts | 61 +++ src/utils/screenshot-diff-region-overlay.ts | 53 +++ src/utils/screenshot-diff-region-split.ts | 171 ++++++++ src/utils/screenshot-diff-regions.ts | 370 +++++++++++++++++ src/utils/screenshot-diff.ts | 97 ++++- website/docs/docs/commands.md | 6 + 17 files changed, 2061 insertions(+), 29 deletions(-) create mode 100644 src/utils/__tests__/screenshot-diff-non-text.test.ts create mode 100644 src/utils/__tests__/screenshot-diff-ocr.test.ts create mode 100644 src/utils/screenshot-diff-non-text.ts create mode 100644 src/utils/screenshot-diff-ocr.ts create mode 100644 src/utils/screenshot-diff-overlay-matches.ts create mode 100644 src/utils/screenshot-diff-region-overlay.ts create mode 100644 src/utils/screenshot-diff-region-split.ts create mode 100644 src/utils/screenshot-diff-regions.ts diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index 8a38ad4d2..3e0be202e 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -46,6 +46,10 @@ agent-device diff snapshot -i Use `screenshot` when the proof needs a rendered image instead of a structural tree. 
- Add `--overlay-refs` when you want the saved PNG to show fresh `@eN` refs burned into the screenshot. +- Use `diff screenshot --baseline <path> --out <path>` when comparing against a saved visual baseline. The text and JSON output include ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, luminance, and a short description so an implementation agent can focus on the biggest visual mismatches instead of a single global pixel percentage. The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. +- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas such as moved labels and possible text metric mismatches. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. +- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, separators, and card/background movement, not semantic icon recognition. +- Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. 
## Session recording diff --git a/src/__tests__/cli-diff.test.ts b/src/__tests__/cli-diff.test.ts index 41e824ff7..74682fa27 100644 --- a/src/__tests__/cli-diff.test.ts +++ b/src/__tests__/cli-diff.test.ts @@ -92,7 +92,25 @@ async function runCliCapture( fs.mkdirSync(path.dirname(outPath), { recursive: true }); fs.writeFileSync(outPath, solidPngBuffer(10, 10, { r: 255, g: 255, b: 255 })); } - return { ok: true, data: { path: outPath } }; + return { + ok: true, + data: { + path: outPath, + ...(req.flags?.overlayRefs + ? { + overlayRefs: [ + { + ref: 'e1', + label: 'Continue', + rect: { x: 1, y: 2, width: 3, height: 4 }, + overlayRect: { x: 1, y: 2, width: 3, height: 4 }, + center: { x: 3, y: 4 }, + }, + ], + } + : {}), + }, + }; } return { ok: true, @@ -249,11 +267,13 @@ describe('cli diff commands', () => { 'screenshot', '--baseline', baseline, + '--overlay-refs', '--threshold', '0.2', ]); assert.equal(result.code, null); // The client-backed command captures a screenshot via the daemon client + // and skips a second overlay capture when there is no diff to map. 
assert.equal(result.calls.length, 1); const call = result.calls[0]!; assert.equal(call.command, 'screenshot'); @@ -321,4 +341,43 @@ describe('cli diff commands', () => { fs.rmSync(fakeHome, { recursive: true, force: true }); } }); + + test('diff screenshot --overlay-refs writes a separate current overlay guide', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-test-')); + const baseline = path.join(dir, 'baseline.png'); + const diffOut = path.join(dir, 'diff.png'); + const overlayOut = path.join(dir, 'diff.current-overlay.png'); + fs.writeFileSync(baseline, solidPngBuffer(10, 10, { r: 0, g: 0, b: 0 })); + + try { + const result = await runCliCapture([ + 'diff', + 'screenshot', + '--baseline', + baseline, + '--out', + diffOut, + '--overlay-refs', + '--threshold', + '0', + ]); + assert.equal(result.code, null); + assert.equal(result.calls.length, 2); + assert.equal(result.calls[0]?.command, 'screenshot'); + assert.equal(result.calls[0]?.flags?.overlayRefs, undefined); + assert.equal(result.calls[1]?.command, 'screenshot'); + assert.equal(result.calls[1]?.flags?.overlayRefs, true); + assert.equal(result.calls[1]?.positionals?.[0], overlayOut); + assert.match(result.stdout, /Diff image:/); + assert.match(result.stdout, /Current overlay:/); + assert.match(result.stdout, /diff\.current-overlay\.png \(1 refs\)/); + assert.match( + result.stdout, + /size=large shape=large-area density=100% boundsPct=0,0,100,100 avgColor=#000000->#ffffff luminance=0->255/, + ); + assert.match(result.stdout, /overlaps @e1 "Continue", 12% of region/); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); }); diff --git a/src/cli/commands/screenshot.ts b/src/cli/commands/screenshot.ts index d7cd01c7f..7016e0170 100644 --- a/src/cli/commands/screenshot.ts +++ b/src/cli/commands/screenshot.ts @@ -4,6 +4,7 @@ import path from 'node:path'; import { formatScreenshotDiffText, formatSnapshotDiffText } from '../../utils/output.ts'; import { AppError } 
from '../../utils/errors.ts'; import { compareScreenshots, type ScreenshotDiffResult } from '../../utils/screenshot-diff.ts'; +import { attachCurrentOverlayMatches } from '../../utils/screenshot-diff-overlay-matches.ts'; import { resolveUserPath } from '../../utils/path-resolution.ts'; import { buildSelectionOptions, writeCommandOutput } from './shared.ts'; import type { ClientCommandHandler } from './router.ts'; @@ -71,6 +72,22 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl threshold: thresholdNum, outputPath, }); + if (flags.overlayRefs && !result.match && !result.dimensionMismatch) { + const overlayResult = await client.capture.screenshot({ + path: outputPath ? deriveCurrentOverlayPath(outputPath) : undefined, + overlayRefs: true, + }); + result = { + ...result, + currentOverlayPath: overlayResult.path, + ...(overlayResult.overlayRefs ? { currentOverlayRefs: overlayResult.overlayRefs } : {}), + ...(result.regions && overlayResult.overlayRefs + ? { + regions: attachCurrentOverlayMatches(result.regions, overlayResult.overlayRefs), + } + : {}), + }; + } } finally { try { fs.unlinkSync(currentPath); @@ -83,3 +100,9 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl writeCommandOutput(flags, result, () => formatScreenshotDiffText(result)); return true; }; + +function deriveCurrentOverlayPath(outputPath: string): string { + const extension = path.extname(outputPath); + const base = extension ? 
outputPath.slice(0, -extension.length) : outputPath; + return `${base}.current-overlay${extension || '.png'}`; +} diff --git a/src/utils/__tests__/output.test.ts b/src/utils/__tests__/output.test.ts index 715e802d3..002951768 100644 --- a/src/utils/__tests__/output.test.ts +++ b/src/utils/__tests__/output.test.ts @@ -664,11 +664,122 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' totalPixels: 10000, mismatchPercentage: 5, diffPath: '/tmp/test/diff.png', + currentOverlayPath: '/tmp/test/diff.current-overlay.png', + currentOverlayRefs: [ + { + ref: 'e1', + label: 'Continue', + rect: { x: 1, y: 2, width: 3, height: 4 }, + overlayRect: { x: 1, y: 2, width: 3, height: 4 }, + center: { x: 3, y: 4 }, + }, + ], + regions: [ + { + index: 1, + rect: { x: 10, y: 20, width: 100, height: 40 }, + center: { x: 60, y: 40 }, + normalizedRect: { x: 10, y: 20, width: 100, height: 40 }, + differentPixels: 350, + shareOfDiffPercentage: 70, + imagePercentage: 3.5, + densityPercentage: 8.75, + shape: 'horizontal-band', + size: 'medium', + location: 'top-left', + averageBaselineColor: { r: 20, g: 20, b: 20 }, + averageCurrentColor: { r: 220, g: 220, b: 220 }, + averageBaselineColorHex: '#141414', + averageCurrentColorHex: '#dcdcdc', + baselineLuminance: 20, + currentLuminance: 220, + dominantChange: 'brighter', + description: + "medium region (horizontal-band) in the top-left; 8.75% of this region's pixels differ; current is brighter.", + currentOverlayMatches: [ + { + ref: 'e1', + label: 'Continue', + rect: { x: 1, y: 2, width: 3, height: 4 }, + overlapPercentage: 100, + regionCoveragePercentage: 12, + }, + ], + }, + ], + ocr: { + provider: 'tesseract', + baselineBlocks: 2, + currentBlocks: 2, + matches: [ + { + text: 'Wi-Fi', + baselineRect: { x: 120, y: 320, width: 60, height: 22 }, + currentRect: { x: 130, y: 332, width: 70, height: 22 }, + baselineNormalizedRect: { x: 12, y: 32, width: 6, height: 2.2 }, + currentNormalizedRect: { x: 13, y: 33.2, 
width: 7, height: 2.2 }, + delta: { x: 10, y: 12, width: 10, height: 0 }, + confidence: 94, + widthRatio: 1.167, + heightRatio: 1, + possibleTextMetricMismatch: true, + description: + 'Text "Wi-Fi" moved 10px right, 12px down; text box is 10px wider; possible font, weight, or text rendering mismatch.', + }, + ], + }, + nonTextDeltas: [ + { + index: 1, + regionIndex: 1, + slot: 'leading', + likelyKind: 'icon', + rect: { x: 80, y: 318, width: 30, height: 30 }, + normalizedRect: { x: 8, y: 31.8, width: 3, height: 3 }, + differentPixels: 400, + densityPercentage: 44.44, + nearestText: 'Wi-Fi', + nearestTextDistancePx: 45, + evidence: [ + 'residual-diff-outside-ocr', + 'nearest-text="Wi-Fi"', + 'slot=leading', + 'shape=icon', + ], + }, + ], }), ); assert.match(text, /✗ 5% pixels differ/); assert.match(text, /Diff image:/); + assert.match(text, /Current overlay:/); + assert.match(text, /diff\.current-overlay\.png \(1 refs\)/); assert.match(text, /500 different \/ 10000 total pixels/); + assert.match(text, /Changed regions:/); + assert.match(text, /1\. 
top-left x=10 y=20 100x40, 70% of diff, current is brighter/); + assert.match( + text, + /size=medium shape=horizontal-band density=8\.75% boundsPct=10,20,100,40 avgColor=#141414->#dcdcdc luminance=20->220/, + ); + assert.match(text, /overlaps @e1 "Continue", 12% of region/); + assert.match( + text, + /OCR text deltas \(tesseract; baselineBlocks=2 currentBlocks=2; showing 1\/1; px\):/, + ); + assert.match( + text, + /item \| text \| movePx \| sizeDeltaPx \| bboxBaseline \| bboxCurrent \| textRatio \| confidence \| issueHint/, + ); + assert.match( + text, + /1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| w=1\.167 h=1 \| 94 \| possible-text-metric-mismatch/, + ); + assert.match(text, /Non-text visual deltas \(showing 1\/1; px\):/); + assert.match(text, /item \| region \| slot \| kind \| bboxCurrent \| nearestText \| evidence/); + assert.match( + text, + /1 \| r1 \| leading \| icon \| x=80,y=318,w=30,h=30 \| "Wi-Fi" \| residual-diff-outside-ocr,nearest-text="Wi-Fi",slot=leading,shape=icon/, + ); assert.equal(text.includes('\x1b['), false); }); diff --git a/src/utils/__tests__/screenshot-diff-non-text.test.ts b/src/utils/__tests__/screenshot-diff-non-text.test.ts new file mode 100644 index 000000000..93441b581 --- /dev/null +++ b/src/utils/__tests__/screenshot-diff-non-text.test.ts @@ -0,0 +1,80 @@ +import assert from 'node:assert/strict'; +import { test } from 'vitest'; +import { summarizeNonTextDiffDeltas } from '../screenshot-diff-non-text.ts'; + +function paintMaskRect( + mask: Uint8Array, + imageWidth: number, + rect: { x: number; y: number; width: number; height: number }, +): void { + for (let y = rect.y; y < rect.y + rect.height; y += 1) { + for (let x = rect.x; x < rect.x + rect.width; x += 1) { + mask[y * imageWidth + x] = 1; + } + } +} + +test('summarizeNonTextDiffDeltas masks OCR text and reports leading icon residuals', () => { + const width = 220; + const height = 120; + const diffMask = new Uint8Array(width * 
height); + paintMaskRect(diffMask, width, { x: 20, y: 30, width: 20, height: 20 }); + paintMaskRect(diffMask, width, { x: 70, y: 32, width: 48, height: 12 }); + + const deltas = summarizeNonTextDiffDeltas({ + diffMask, + width, + height, + regions: [ + { + index: 1, + rect: { x: 0, y: 20, width: 180, height: 50 }, + center: { x: 90, y: 45 }, + normalizedRect: { x: 0, y: 16.67, width: 81.82, height: 41.67 }, + differentPixels: 976, + shareOfDiffPercentage: 100, + imagePercentage: 3.7, + densityPercentage: 10.84, + shape: 'horizontal-band', + size: 'medium', + location: 'center', + averageBaselineColor: { r: 0, g: 0, b: 0 }, + averageCurrentColor: { r: 255, g: 255, b: 255 }, + averageBaselineColorHex: '#000000', + averageCurrentColorHex: '#ffffff', + baselineLuminance: 0, + currentLuminance: 255, + dominantChange: 'brighter', + description: 'test region', + }, + ], + ocr: { + provider: 'tesseract', + baselineBlocks: 1, + currentBlocks: 1, + baselineBlocksRaw: [ + { + text: 'Wi-Fi', + confidence: 90, + rect: { x: 68, y: 28, width: 60, height: 24 }, + normalizedRect: { x: 30.91, y: 23.33, width: 27.27, height: 20 }, + }, + ], + currentBlocksRaw: [], + matches: [], + }, + }); + + assert.equal(deltas.length, 1); + assert.equal(deltas[0]?.regionIndex, 1); + assert.equal(deltas[0]?.slot, 'leading'); + assert.equal(deltas[0]?.likelyKind, 'icon'); + assert.deepEqual(deltas[0]?.rect, { x: 20, y: 30, width: 20, height: 20 }); + assert.equal(deltas[0]?.nearestText, 'Wi-Fi'); + assert.deepEqual(deltas[0]?.evidence, [ + 'residual-diff-outside-ocr', + 'nearest-text="Wi-Fi"', + 'slot=leading', + 'shape=icon', + ]); +}); diff --git a/src/utils/__tests__/screenshot-diff-ocr.test.ts b/src/utils/__tests__/screenshot-diff-ocr.test.ts new file mode 100644 index 000000000..ff6f51120 --- /dev/null +++ b/src/utils/__tests__/screenshot-diff-ocr.test.ts @@ -0,0 +1,71 @@ +import assert from 'node:assert/strict'; +import { test } from 'vitest'; +import { matchOcrBlocks, parseTesseractTsv } from 
'../screenshot-diff-ocr.ts'; + +test('parseTesseractTsv groups word rows into text line blocks', () => { + const blocks = parseTesseractTsv( + [ + 'level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext', + '5\t1\t1\t1\t1\t1\t100\t200\t40\t20\t96\tAirplane', + '5\t1\t1\t1\t1\t2\t150\t200\t30\t20\t94\tMode', + '5\t1\t1\t1\t1\t3\t300\t200\t90\t20\t92\tDisconnected', + '5\t1\t1\t1\t2\t1\t100\t240\t50\t20\t90\tWi-Fi', + '5\t1\t1\t1\t3\t1\t100\t280\t10\t20\t-1\t', + ].join('\n'), + 400, + 800, + ); + + assert.equal(blocks.length, 3); + assert.deepEqual(blocks[0], { + text: 'Airplane Mode', + confidence: 95, + rect: { x: 100, y: 200, width: 80, height: 20 }, + normalizedRect: { x: 25, y: 25, width: 20, height: 2.5 }, + }); + assert.deepEqual(blocks[1], { + text: 'Disconnected', + confidence: 92, + rect: { x: 300, y: 200, width: 90, height: 20 }, + normalizedRect: { x: 75, y: 25, width: 22.5, height: 2.5 }, + }); + assert.deepEqual(blocks[2], { + text: 'Wi-Fi', + confidence: 90, + rect: { x: 100, y: 240, width: 50, height: 20 }, + normalizedRect: { x: 25, y: 30, width: 12.5, height: 2.5 }, + }); +}); + +test('matchOcrBlocks reports movement and possible text metric mismatch', () => { + const matches = matchOcrBlocks( + [ + { + text: 'Wi-Fi', + confidence: 96, + rect: { x: 100, y: 200, width: 50, height: 20 }, + normalizedRect: { x: 25, y: 25, width: 12.5, height: 2.5 }, + }, + ], + [ + { + text: 'Wi-Fi', + confidence: 94, + rect: { x: 112, y: 192, width: 60, height: 20 }, + normalizedRect: { x: 28, y: 24, width: 15, height: 2.5 }, + }, + ], + ); + + assert.equal(matches.length, 1); + assert.deepEqual(matches[0]?.delta, { x: 12, y: -8, width: 10, height: 0 }); + assert.deepEqual(matches[0]?.baselineNormalizedRect, { x: 25, y: 25, width: 12.5, height: 2.5 }); + assert.deepEqual(matches[0]?.currentNormalizedRect, { x: 28, y: 24, width: 15, height: 2.5 }); + assert.equal(matches[0]?.widthRatio, 1.2); + 
assert.equal(matches[0]?.heightRatio, 1); + assert.equal(matches[0]?.possibleTextMetricMismatch, true); + assert.equal( + matches[0]?.description, + 'Text "Wi-Fi" moved 12px right, 8px up; text box is 10px wider; possible font, weight, or text rendering mismatch.', + ); +}); diff --git a/src/utils/__tests__/screenshot-diff.test.ts b/src/utils/__tests__/screenshot-diff.test.ts index bc1335281..6dd98be99 100644 --- a/src/utils/__tests__/screenshot-diff.test.ts +++ b/src/utils/__tests__/screenshot-diff.test.ts @@ -27,6 +27,22 @@ function writeSolidPng( fs.writeFileSync(filePath, PNG.sync.write(png)); } +function paintRect( + png: PNG, + rect: { x: number; y: number; width: number; height: number }, + color: { r: number; g: number; b: number }, +): void { + for (let y = rect.y; y < rect.y + rect.height; y += 1) { + for (let x = rect.x; x < rect.x + rect.width; x += 1) { + const index = (y * png.width + x) * 4; + png.data[index] = color.r; + png.data[index + 1] = color.g; + png.data[index + 2] = color.b; + png.data[index + 3] = 255; + } + } +} + test('identical images produce match: true with 0% mismatch', async () => { const dir = tmpDir(); const baseline = path.join(dir, 'baseline.png'); @@ -85,6 +101,91 @@ test('completely different images produce match: false with 100% mismatch', asyn assert.ok(fs.existsSync(diffOut), 'diff image should be written'); }); +test('changed pixels are summarized into nearby diff regions', async () => { + const dir = tmpDir(); + const baseline = path.join(dir, 'baseline.png'); + const current = path.join(dir, 'current.png'); + const diffOut = path.join(dir, 'diff.png'); + + writeSolidPng(baseline, 40, 20, { r: 0, g: 0, b: 0 }); + + const currentPng = new PNG({ width: 40, height: 20 }); + for (let i = 0; i < currentPng.data.length; i += 4) { + currentPng.data[i] = 0; + currentPng.data[i + 1] = 0; + currentPng.data[i + 2] = 0; + currentPng.data[i + 3] = 255; + } + paintRect(currentPng, { x: 2, y: 2, width: 4, height: 4 }, { r: 255, g: 255, 
b: 255 }); + paintRect(currentPng, { x: 10, y: 2, width: 4, height: 4 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 30, y: 15, width: 4, height: 4 }, { r: 255, g: 255, b: 255 }); + fs.writeFileSync(current, PNG.sync.write(currentPng)); + + const result = await compareScreenshots(baseline, current, { + outputPath: diffOut, + threshold: 0, + }); + + assert.equal(result.differentPixels, 48); + assert.equal(result.regions?.length, 2); + assert.deepEqual(result.regions?.[0]?.rect, { x: 2, y: 2, width: 12, height: 4 }); + assert.equal(result.regions?.[0]?.differentPixels, 32); + assert.equal(result.regions?.[0]?.shareOfDiffPercentage, 66.67); + assert.deepEqual(result.regions?.[0]?.normalizedRect, { x: 5, y: 10, width: 30, height: 20 }); + assert.equal(result.regions?.[0]?.densityPercentage, 66.67); + assert.equal(result.regions?.[0]?.shape, 'horizontal-band'); + assert.equal(result.regions?.[0]?.size, 'large'); + assert.equal(result.regions?.[0]?.averageBaselineColorHex, '#000000'); + assert.equal(result.regions?.[0]?.averageCurrentColorHex, '#ffffff'); + assert.equal(result.regions?.[0]?.baselineLuminance, 0); + assert.equal(result.regions?.[0]?.currentLuminance, 255); + assert.equal(result.regions?.[0]?.location, 'top-left'); + assert.equal(result.regions?.[0]?.dominantChange, 'brighter'); + assert.equal( + result.regions?.[0]?.description, + "large region (horizontal-band) in the top-left; 66.67% of this region's pixels differ; current is brighter.", + ); + assert.deepEqual(result.regions?.[1]?.rect, { x: 30, y: 15, width: 4, height: 4 }); + + const diffPng = PNG.sync.read(fs.readFileSync(diffOut)); + const borderPixel = (2 * diffPng.width + 2) * 4; + assert.equal(diffPng.data[borderPixel], 0); + assert.equal(diffPng.data[borderPixel + 1], 187); + assert.equal(diffPng.data[borderPixel + 2], 255); +}); + +test('large connected diff regions are split at horizontal low-density bands', async () => { + const dir = tmpDir(); + const baseline = path.join(dir, 
'baseline.png'); + const current = path.join(dir, 'current.png'); + + writeSolidPng(baseline, 100, 220, { r: 0, g: 0, b: 0 }); + + const currentPng = new PNG({ width: 100, height: 220 }); + for (let i = 0; i < currentPng.data.length; i += 4) { + currentPng.data[i] = 0; + currentPng.data[i + 1] = 0; + currentPng.data[i + 2] = 0; + currentPng.data[i + 3] = 255; + } + paintRect(currentPng, { x: 0, y: 0, width: 100, height: 80 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 50, y: 80, width: 1, height: 50 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 0, y: 130, width: 100, height: 90 }, { r: 255, g: 255, b: 255 }); + fs.writeFileSync(current, PNG.sync.write(currentPng)); + + const result = await compareScreenshots(baseline, current, { + outputPath: path.join(dir, 'diff.png'), + threshold: 0, + }); + + assert.equal(result.regions?.length, 2); + const rectsByTop = result.regions?.map((region) => region.rect).sort((a, b) => a.y - b.y); + assert.deepEqual(rectsByTop, [ + { x: 0, y: 0, width: 100, height: 106 }, + { x: 0, y: 106, width: 100, height: 114 }, + ]); +}); + test('no diff path is persisted when outputPath is omitted', async () => { const dir = tmpDir(); const baseline = path.join(dir, 'baseline.png'); @@ -99,7 +200,7 @@ test('no diff path is persisted when outputPath is omitted', async () => { assert.equal(result.diffPath, undefined); }); -test('diff image marks different pixels as red and unchanged as dimmed gray', async () => { +test('diff image marks changed pixels red over a light current-screen context', async () => { const dir = tmpDir(); const baseline = path.join(dir, 'baseline.png'); const current = path.join(dir, 'current.png'); @@ -143,16 +244,15 @@ test('diff image marks different pixels as red and unchanged as dimmed gray', as // Read the diff image and verify pixel colors const diffPng = PNG.sync.read(fs.readFileSync(diffOut)); - // Pixel 0 (unchanged white): should be dimmed gray - // gray = round((255+255+255)/3) = 255, 
dimmed = round(255*0.3) = 77 - assert.equal(diffPng.data[0], 77); // R - assert.equal(diffPng.data[1], 77); // G - assert.equal(diffPng.data[2], 77); // B + // Pixel 0 (unchanged white): should stay light as screenshot context. + assert.equal(diffPng.data[0], 255); // R + assert.equal(diffPng.data[1], 255); // G + assert.equal(diffPng.data[2], 255); // B - // Pixel 1 (different): should be red - assert.equal(diffPng.data[4], 255); // R - assert.equal(diffPng.data[5], 0); // G - assert.equal(diffPng.data[6], 0); // B + // Pixel 1 (different): should be red-tinted while preserving light context. + assert.equal(diffPng.data[4], 228); // R + assert.equal(diffPng.data[5], 56); // G + assert.equal(diffPng.data[6], 56); // B }); test('dimension mismatch returns expected vs actual sizes', async () => { diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 944d90a6e..f74d25b51 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -886,7 +886,7 @@ const FLAG_DEFINITIONS: readonly FlagDefinition[] = [ type: 'boolean', usageLabel: '--overlay-refs', usageDescription: - 'Screenshot: draw current snapshot refs and target rectangles onto the saved PNG', + 'Screenshot: draw current snapshot refs and target rectangles onto the saved PNG; diff screenshot: also write a separate current-screen overlay guide', }, { key: 'screenshotFullscreen', @@ -993,11 +993,11 @@ const COMMAND_SCHEMAS: Record = { }, diff: { usageOverride: - 'diff snapshot | diff screenshot --baseline [--out ] [--threshold <0-1>]', + 'diff snapshot | diff screenshot --baseline [--out ] [--threshold <0-1>] [--overlay-refs]', helpDescription: 'Diff accessibility snapshot or compare screenshots pixel-by-pixel', summary: 'Diff snapshot or screenshot', positionalArgs: ['kind'], - allowedFlags: [...SNAPSHOT_FLAGS, 'baseline', 'threshold', 'out'], + allowedFlags: [...SNAPSHOT_FLAGS, 'baseline', 'threshold', 'out', 'overlayRefs'], }, 'ensure-simulator': { helpDescription: 'Ensure 
an iOS simulator exists in a device set (create if missing)', diff --git a/src/utils/output.ts b/src/utils/output.ts index da2f339c8..20ff14f7f 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -4,6 +4,7 @@ import { buildSnapshotDisplayLines, formatSnapshotLine } from './snapshot-lines. import type { SnapshotNode, SnapshotVisibility } from './snapshot.ts'; import { displayNodeLabel } from './snapshot-tree.ts'; import type { ScreenshotDiffResult } from './screenshot-diff.ts'; +import type { ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; import { styleText } from 'node:util'; import { buildMobileSnapshotPresentation } from './mobile-snapshot-semantics.ts'; @@ -227,14 +228,132 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { lines.push(` ${label} ${displayPath}`); } + if (data.currentOverlayPath && !match) { + const relativePath = toRelativePath(data.currentOverlayPath); + const label = useColor ? colorize('Current overlay:', 'dim') : 'Current overlay:'; + const displayPath = useColor ? colorize(relativePath, 'green') : relativePath; + const refCount = Array.isArray(data.currentOverlayRefs) ? data.currentOverlayRefs.length : 0; + const refSuffix = refCount > 0 ? ` (${refCount} refs)` : ''; + lines.push(` ${label} ${displayPath}${refSuffix}`); + } + if (!match && !dimensionMismatch) { const diffCount = useColor ? colorize(String(differentPixels), 'red') : String(differentPixels); lines.push(` ${diffCount} different / ${totalPixels} total pixels`); } + const regions = Array.isArray(data.regions) ? data.regions : []; + if (!match && !dimensionMismatch && regions.length > 0) { + lines.push(' Changed regions:'); + for (const region of regions.slice(0, 5)) { + const share = + region.shareOfDiffPercentage === 0 && region.differentPixels > 0 + ? '<0.01' + : String(region.shareOfDiffPercentage); + const rect = region.rect; + lines.push( + ` ${region.index}. 
${region.location} x=${rect.x} y=${rect.y} ` + + `${rect.width}x${rect.height}, ${share}% of diff, ` + + formatDominantScreenshotChange(region.dominantChange), + ); + const detailLine = formatScreenshotRegionDetails(region); + if (detailLine) { + lines.push(` ${detailLine}`); + } + const bestMatch = region.currentOverlayMatches?.[0]; + if (bestMatch) { + const label = bestMatch.label ? ` "${bestMatch.label}"` : ''; + lines.push( + ` overlaps @${bestMatch.ref}${label}, ` + + `${bestMatch.regionCoveragePercentage}% of region`, + ); + } + } + } + + const ocrMatches = data.ocr?.matches ?? []; + if (!match && !dimensionMismatch && ocrMatches.length > 0) { + const shownOcrMatches = ocrMatches.slice(0, 8); + lines.push( + ` OCR text deltas (${data.ocr?.provider}; baselineBlocks=${data.ocr?.baselineBlocks} ` + + `currentBlocks=${data.ocr?.currentBlocks}; showing ${shownOcrMatches.length}/${ocrMatches.length}; px):`, + ); + lines.push( + ' item | text | movePx | sizeDeltaPx | bboxBaseline | bboxCurrent | textRatio | confidence | issueHint', + ); + for (const [index, match] of shownOcrMatches.entries()) { + const delta = match.delta; + lines.push( + ` ${index + 1} | ${JSON.stringify(match.text)} | ` + + `${formatSignedPixels(delta.x)},${formatSignedPixels(delta.y)} | ` + + `${formatSignedPixels(delta.width)},${formatSignedPixels(delta.height)} | ` + + `${formatRect(match.baselineRect)} | ${formatRect(match.currentRect)} | ` + + `w=${match.widthRatio} h=${match.heightRatio} | ${match.confidence} | ` + + `${match.possibleTextMetricMismatch ? 'possible-text-metric-mismatch' : '-'}`, + ); + } + } + + const nonTextDeltas = data.nonTextDeltas ?? 
[]; + if (!match && !dimensionMismatch && nonTextDeltas.length > 0) { + const shownNonTextDeltas = nonTextDeltas.slice(0, 8); + lines.push( + ` Non-text visual deltas (showing ${shownNonTextDeltas.length}/${nonTextDeltas.length}; px):`, + ); + lines.push(' item | region | slot | kind | bboxCurrent | nearestText | evidence'); + for (const delta of shownNonTextDeltas) { + lines.push( + ` ${delta.index} | ${delta.regionIndex ? `r${delta.regionIndex}` : '-'} | ` + + `${delta.slot} | ${delta.likelyKind} | ${formatRect(delta.rect)} | ` + + `${delta.nearestText ? JSON.stringify(delta.nearestText) : '-'} | ` + + `${delta.evidence.join(',')}`, + ); + } + } + return `${lines.join('\n')}\n`; } +function formatRect(rect: { x: number; y: number; width: number; height: number }): string { + return `x=${rect.x},y=${rect.y},w=${rect.width},h=${rect.height}`; +} + +function formatSignedPixels(value: number): string { + return value > 0 ? `+${value}` : String(value); +} + +function formatDominantScreenshotChange(change: string | undefined): string { + switch (change) { + case 'brighter': + return 'current is brighter'; + case 'darker': + return 'current is darker'; + case 'color-shift': + return 'color shifted'; + default: + return 'mixed change'; + } +} + +function formatScreenshotRegionDetails(region: ScreenshotDiffRegion): string | null { + const normalizedRect = region.normalizedRect; + const details = [ + region.size ? `size=${region.size}` : null, + region.shape ? `shape=${region.shape}` : null, + typeof region.densityPercentage === 'number' ? `density=${region.densityPercentage}%` : null, + normalizedRect + ? `boundsPct=${normalizedRect.x},${normalizedRect.y},${normalizedRect.width},${normalizedRect.height}` + : null, + region.averageBaselineColorHex && region.averageCurrentColorHex + ? `avgColor=${region.averageBaselineColorHex}->${region.averageCurrentColorHex}` + : null, + typeof region.baselineLuminance === 'number' && typeof region.currentLuminance === 'number' + ? 
`luminance=${region.baselineLuminance}->${region.currentLuminance}` + : null, + ].filter((entry): entry is string => entry !== null); + return details.length > 0 ? details.join(' ') : null; +} + function toRelativePath(filePath: string): string { const cwd = process.cwd(); const relativePath = path.relative(cwd, filePath); diff --git a/src/utils/screenshot-diff-non-text.ts b/src/utils/screenshot-diff-non-text.ts new file mode 100644 index 000000000..6acbd18f1 --- /dev/null +++ b/src/utils/screenshot-diff-non-text.ts @@ -0,0 +1,364 @@ +import type { Rect } from './snapshot.ts'; +import type { ScreenshotOcrAnalysis, ScreenshotOcrBlock } from './screenshot-diff-ocr.ts'; +import type { ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; + +export type ScreenshotNonTextDelta = { + index: number; + regionIndex?: number; + slot: 'leading' | 'trailing' | 'background' | 'separator' | 'unknown'; + likelyKind: 'icon' | 'toggle' | 'chevron' | 'separator' | 'card-or-background' | 'visual'; + rect: Rect; + normalizedRect: Rect; + differentPixels: number; + densityPercentage: number; + nearestText?: string; + nearestTextDistancePx?: number; + evidence: string[]; +}; + +const MAX_NON_TEXT_DELTAS = 12; +const OCR_MASK_PADDING_PX = 8; +const MIN_COMPONENT_PIXELS = 24; +const MIN_COMPONENT_SIDE = 3; +const MERGE_GAP_PX = 10; +const MIN_CONTENT_Y_RATIO = 0.08; +const KIND_SCORE = { + icon: 90, + toggle: 90, + chevron: 75, + separator: 45, + visual: 35, + 'card-or-background': 10, +} satisfies Record; +const SLOT_SCORE = { + leading: 20, + trailing: 20, + separator: 10, + unknown: 0, + background: -30, +} satisfies Record; + +type MutableComponent = { + minX: number; + minY: number; + maxX: number; + maxY: number; + differentPixels: number; +}; + +export function summarizeNonTextDiffDeltas(params: { + diffMask: Uint8Array; + width: number; + height: number; + regions: ScreenshotDiffRegion[]; + ocr?: ScreenshotOcrAnalysis; + maxDeltas?: number; +}): ScreenshotNonTextDelta[] { + 
/**
 * Builds the "residual" diff mask: a copy of `diffMask` in which every OCR text
 * box from both the baseline and the current analysis has been zeroed out.
 *
 * Each text box is grown by OCR_MASK_PADDING_PX on every side before it is
 * cleared. The caller's mask is never written to; without OCR data the result is
 * simply an untouched copy.
 *
 * @param diffMask - Per-pixel diff flags, addressed by `clearRect` as `y * width + x`.
 * @param width - Image width in pixels.
 * @param height - Image height in pixels.
 * @param ocr - OCR analysis whose raw blocks are masked out, or `undefined`.
 * @returns A new mask with the padded text boxes cleared.
 */
function maskOcrText(
  diffMask: Uint8Array,
  width: number,
  height: number,
  ocr: ScreenshotOcrAnalysis | undefined,
): Uint8Array {
  const residual = Uint8Array.from(diffMask);
  const textBlocks = ocr ? ocr.baselineBlocksRaw.concat(ocr.currentBlocksRaw) : [];
  textBlocks.forEach(({ rect }) => {
    const paddedRect = expandRect(rect, OCR_MASK_PADDING_PX);
    clearRect(residual, width, height, paddedRect);
  });
  return residual;
}
/**
 * Folds components whose bounding boxes lie within `gapPx` of each other into a
 * single bounding box, summing their changed-pixel counts.
 *
 * Components are visited top-to-bottom, then left-to-right. Each one is merged
 * into the first already-collected box it is near (per `componentsAreNear`), or
 * starts a new box. This is a single pass: a box that grows is not re-compared
 * with boxes collected earlier.
 *
 * Neither the `components` array nor its elements are modified; the returned
 * boxes are fresh objects.
 *
 * @param components - Bounding boxes to merge.
 * @param gapPx - Maximum gap, in pixels, at which two boxes count as near.
 * @returns The merged bounding boxes.
 */
function mergeNearbyComponents(components: MutableComponent[], gapPx: number): MutableComponent[] {
  const merged: MutableComponent[] = [];
  // Array.prototype.sort works in place, so sort a copy rather than silently
  // reordering the array the caller handed in.
  const ordered = [...components].sort(
    (left, right) => left.minY - right.minY || left.minX - right.minX,
  );
  for (const component of ordered) {
    const existing = merged.find((candidate) => componentsAreNear(candidate, component, gapPx));
    if (!existing) {
      // Copy so that later merges never write through to the caller's objects.
      merged.push({ ...component });
      continue;
    }
    existing.minX = Math.min(existing.minX, component.minX);
    existing.minY = Math.min(existing.minY, component.minY);
    existing.maxX = Math.max(existing.maxX, component.maxX);
    existing.maxY = Math.max(existing.maxY, component.maxY);
    existing.differentPixels += component.differentPixels;
  }
  return merged;
}
classifySlot(rect, nearestText?.block.rect, params.width); + const likelyKind = classifyLikelyKind(rect, slot, component.differentPixels); + const evidence = buildEvidence(slot, likelyKind, nearestText?.block.text); + return { + ...(regionIndex ? { regionIndex } : {}), + slot, + likelyKind, + rect, + normalizedRect: { + x: roundPercentage(rect.x / params.width), + y: roundPercentage(rect.y / params.height), + width: roundPercentage(rect.width / params.width), + height: roundPercentage(rect.height / params.height), + }, + differentPixels: component.differentPixels, + densityPercentage: roundPercentage(component.differentPixels / (rect.width * rect.height)), + ...(nearestText + ? { + nearestText: nearestText.block.text, + nearestTextDistancePx: Math.round(nearestText.distance), + } + : {}), + evidence, + }; +} + +function classifySlot( + rect: Rect, + nearestTextRect: Rect | undefined, + imageWidth: number, +): ScreenshotNonTextDelta['slot'] { + if (rect.height <= 3 && rect.width >= 60) return 'separator'; + if (!nearestTextRect) { + if (rect.width >= imageWidth * 0.4) return 'background'; + return 'unknown'; + } + if (rect.width >= imageWidth * 0.4) return 'background'; + const rectCenterX = rect.x + rect.width / 2; + const textCenterX = nearestTextRect.x + nearestTextRect.width / 2; + if (rectCenterX < textCenterX - nearestTextRect.width / 2) return 'leading'; + if (rectCenterX > textCenterX + nearestTextRect.width / 2) return 'trailing'; + return rect.width >= imageWidth * 0.35 ? 
/**
 * Ranks a non-text delta; higher scores sort first in `summarizeNonTextDiffDeltas`.
 *
 * The score is the sum of:
 * - the KIND_SCORE entry for `likelyKind`,
 * - the SLOT_SCORE entry for `slot`,
 * - 20 when the delta is attached to a diff region,
 * - a -35 penalty when the box is at least 300px wide or 160px tall,
 * - up to 20 points for changed pixels (one point per 200 pixels).
 *
 * @param delta - A delta that has not yet been assigned its final `index`.
 * @returns The ranking score.
 */
function scoreNonTextDelta(delta: Omit<ScreenshotNonTextDelta, 'index'>): number {
  const isOversized = delta.rect.width >= 300 || delta.rect.height >= 160;
  const sizePenalty = isOversized ? -35 : 0;
  const regionScore = delta.regionIndex ? 20 : 0;
  const pixelScore = Math.min(20, delta.differentPixels / 200);
  return (
    KIND_SCORE[delta.likelyKind] + SLOT_SCORE[delta.slot] + regionScore + sizePenalty + pixelScore
  );
}
/**
 * Zeroes every mask entry covered by `rect`, clipped to the image bounds.
 *
 * The rectangle is widened to whole pixels (floor of the start, ceil of the end).
 * A rectangle that lies completely outside the image clears nothing.
 *
 * @param mask - Per-pixel flags addressed as `y * width + x`; modified in place.
 * @param width - Image width in pixels.
 * @param height - Image height in pixels.
 * @param rect - Area to clear; may extend past any image edge.
 */
function clearRect(mask: Uint8Array, width: number, height: number, rect: Rect): void {
  // Clamp the start to [0, width] / [0, height], not to the last valid index:
  // clamping to `width - 1` turned a rectangle lying entirely beyond the right or
  // bottom edge into a one-pixel-wide strip that wiped the last column or row.
  const minX = clamp(Math.floor(rect.x), 0, width);
  const minY = clamp(Math.floor(rect.y), 0, height);
  const maxX = clamp(Math.ceil(rect.x + rect.width), 0, width);
  const maxY = clamp(Math.ceil(rect.y + rect.height), 0, height);
  // Written as a negated comparison so NaN coordinates also clear nothing.
  if (!(minX < maxX)) return;
  for (let y = minY; y < maxY; y += 1) {
    const rowStart = y * width;
    mask.fill(0, rowStart + minX, rowStart + maxX);
  }
}
intersectArea(left: Rect, right: Rect): number { + const minX = Math.max(left.x, right.x); + const minY = Math.max(left.y, right.y); + const maxX = Math.min(left.x + left.width, right.x + right.width); + const maxY = Math.min(left.y + left.height, right.y + right.height); + if (maxX <= minX || maxY <= minY) return 0; + return (maxX - minX) * (maxY - minY); +} + +function rectCenter(rect: Rect): { x: number; y: number } { + return { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 }; +} + +function squaredDistance(left: { x: number; y: number }, right: { x: number; y: number }): number { + return (left.x - right.x) ** 2 + (left.y - right.y) ** 2; +} + +function clamp(value: number, min: number, max: number): number { + return Math.min(Math.max(value, min), max); +} + +function roundPercentage(ratio: number): number { + return Math.round(ratio * 100 * 100) / 100; +} diff --git a/src/utils/screenshot-diff-ocr.ts b/src/utils/screenshot-diff-ocr.ts new file mode 100644 index 000000000..1b4679762 --- /dev/null +++ b/src/utils/screenshot-diff-ocr.ts @@ -0,0 +1,373 @@ +import type { Rect } from './snapshot.ts'; +import { runCmd, whichCmd } from './exec.ts'; + +export type ScreenshotOcrBlock = { + text: string; + confidence: number; + rect: Rect; + normalizedRect: Rect; +}; + +export type ScreenshotOcrTextMatch = { + text: string; + baselineRect: Rect; + currentRect: Rect; + baselineNormalizedRect: Rect; + currentNormalizedRect: Rect; + delta: { x: number; y: number; width: number; height: number }; + confidence: number; + widthRatio: number; + heightRatio: number; + possibleTextMetricMismatch: boolean; + description: string; +}; + +export type ScreenshotOcrSummary = { + provider: 'tesseract'; + baselineBlocks: number; + currentBlocks: number; + matches: ScreenshotOcrTextMatch[]; +}; + +export type ScreenshotOcrAnalysis = ScreenshotOcrSummary & { + baselineBlocksRaw: ScreenshotOcrBlock[]; + currentBlocksRaw: ScreenshotOcrBlock[]; +}; + +type TesseractWord = { + key: 
/**
 * Runs tesseract on the baseline and current screenshots and summarizes how the
 * recognized text differs between them.
 *
 * This is best effort. The result is `undefined` when:
 * - `whichCmd('tesseract')` reports that the binary is not available,
 * - either tesseract run exits with a non-zero code,
 * - neither image produced any text block, or
 * - anything inside the OCR pipeline throws.
 *
 * @param params.baselinePath - Path of the baseline image handed to tesseract.
 * @param params.currentPath - Path of the current image handed to tesseract.
 * @param params.width - Image width passed to `parseTesseractTsv`.
 * @param params.height - Image height passed to `parseTesseractTsv`.
 * @returns The OCR analysis (block counts, raw blocks and matches), or `undefined`.
 */
export async function summarizeScreenshotOcr(params: {
  baselinePath: string;
  currentPath: string;
  width: number;
  height: number;
}): Promise<ScreenshotOcrAnalysis | undefined> {
  if (!(await whichCmd('tesseract'))) return undefined;

  try {
    // The two OCR runs are independent, so start them together.
    const [baselineResult, currentResult] = await Promise.all([
      runTesseractTsv(params.baselinePath),
      runTesseractTsv(params.currentPath),
    ]);
    if (baselineResult.exitCode !== 0 || currentResult.exitCode !== 0) return undefined;

    const baselineBlocks = parseTesseractTsv(baselineResult.stdout, params.width, params.height);
    const currentBlocks = parseTesseractTsv(currentResult.stdout, params.width, params.height);
    // Bail out before matching: with no text on either side there is nothing to compare.
    if (baselineBlocks.length === 0 && currentBlocks.length === 0) return undefined;

    return {
      provider: 'tesseract',
      baselineBlocks: baselineBlocks.length,
      currentBlocks: currentBlocks.length,
      baselineBlocksRaw: baselineBlocks,
      currentBlocksRaw: currentBlocks,
      matches: matchOcrBlocks(baselineBlocks, currentBlocks),
    };
  } catch {
    // Deliberate: a failing OCR step is reported as "no OCR data" instead of an error.
    return undefined;
  }
}
continue; + const values = line.split('\t'); + const level = readTsvNumber(values, indexByName, 'level'); + const rawText = readTsvString(values, indexByName, 'text').trim(); + const confidence = readTsvNumber(values, indexByName, 'conf'); + if (level !== 5 || !isMeaningfulText(rawText) || confidence < 0) continue; + + const left = readTsvNumber(values, indexByName, 'left'); + const top = readTsvNumber(values, indexByName, 'top'); + const width = readTsvNumber(values, indexByName, 'width'); + const height = readTsvNumber(values, indexByName, 'height'); + if (width <= 0 || height <= 0) continue; + + words.push({ + key: [ + readTsvString(values, indexByName, 'page_num'), + readTsvString(values, indexByName, 'block_num'), + readTsvString(values, indexByName, 'par_num'), + readTsvString(values, indexByName, 'line_num'), + ].join(':'), + text: rawText, + confidence, + rect: { x: left, y: top, width, height }, + }); + } + + const wordsByLine = new Map(); + for (const word of words) { + const existing = wordsByLine.get(word.key); + if (existing) existing.push(word); + else wordsByLine.set(word.key, [word]); + } + + return Array.from(wordsByLine.values()) + .flatMap((lineWords) => splitLineWordsIntoSegments(lineWords)) + .map((segmentWords) => toOcrBlock(segmentWords, imageWidth, imageHeight)) + .filter((block): block is ScreenshotOcrBlock => block !== null); +} + +export function matchOcrBlocks( + baselineBlocks: ScreenshotOcrBlock[], + currentBlocks: ScreenshotOcrBlock[], +): ScreenshotOcrTextMatch[] { + const usedCurrent = new Set(); + const matches: ScreenshotOcrTextMatch[] = []; + + for (const baselineBlock of baselineBlocks) { + const normalizedText = normalizeTextForMatching(baselineBlock.text); + const currentIndex = findBestCurrentMatch( + baselineBlock, + normalizedText, + currentBlocks, + usedCurrent, + ); + if (currentIndex === null) continue; + usedCurrent.add(currentIndex); + + const currentBlock = currentBlocks[currentIndex]!; + const match = 
/**
 * Splits the words of one OCR line into separate segments wherever two
 * horizontally adjacent words are far apart.
 *
 * Words are ordered by their left edge first (the input array is left untouched).
 * A new segment starts when the gap between a word and its predecessor exceeds
 * the larger of MIN_SEGMENT_GAP_PX and 2.5 times the taller word's height.
 *
 * @param words - Words belonging to one OCR line, in any order.
 * @returns Segments in left-to-right order; empty when `words` is empty.
 */
function splitLineWordsIntoSegments(words: TesseractWord[]): TesseractWord[][] {
  const ordered = words.slice().sort((first, second) => first.rect.x - second.rect.x);
  const segments: TesseractWord[][] = [];
  ordered.forEach((word, position) => {
    if (position > 0) {
      // The predecessor is always the last word of the segment being built.
      const previous = ordered[position - 1]!;
      const previousRightEdge = previous.rect.x + previous.rect.width;
      const tallerHeight = Math.max(previous.rect.height, word.rect.height);
      const splitThreshold = Math.max(MIN_SEGMENT_GAP_PX, tallerHeight * 2.5);
      const isFarApart = word.rect.x - previousRightEdge > splitThreshold;
      if (!isFarApart) {
        segments[segments.length - 1]!.push(word);
        return;
      }
    }
    segments.push([word]);
  });
  return segments;
}
/**
 * Describes how one matched text block changed between baseline and current.
 *
 * Reports the pixel delta of position and size, the width/height ratios
 * (current divided by baseline, rounded by `roundRatio`), and flags a possible
 * text metric mismatch when either ratio deviates from 1 by at least its
 * threshold. The reported text is the baseline block's text and the confidence is
 * the lower of the two blocks' confidences, rounded to two decimals.
 *
 * @param baselineBlock - The block found in the baseline screenshot.
 * @param currentBlock - The block it was matched with in the current screenshot.
 * @returns The match record, including a human-readable description.
 */
function toOcrTextMatch(
  baselineBlock: ScreenshotOcrBlock,
  currentBlock: ScreenshotOcrBlock,
): ScreenshotOcrTextMatch {
  const { rect: before, text } = baselineBlock;
  const { rect: after } = currentBlock;

  const delta = {
    x: after.x - before.x,
    y: after.y - before.y,
    width: after.width - before.width,
    height: after.height - before.height,
  };

  const widthRatio = roundRatio(after.width / before.width);
  const heightRatio = roundRatio(after.height / before.height);
  const widthDrift = Math.abs(widthRatio - 1);
  const heightDrift = Math.abs(heightRatio - 1);
  const possibleTextMetricMismatch =
    widthDrift >= TEXT_WIDTH_MISMATCH_RATIO || heightDrift >= TEXT_HEIGHT_MISMATCH_RATIO;

  const weakerConfidence = Math.min(baselineBlock.confidence, currentBlock.confidence);

  return {
    text,
    baselineRect: before,
    currentRect: after,
    baselineNormalizedRect: baselineBlock.normalizedRect,
    currentNormalizedRect: currentBlock.normalizedRect,
    delta,
    confidence: Math.round(weakerConfidence * 100) / 100,
    widthRatio,
    heightRatio,
    possibleTextMetricMismatch,
    description: describeOcrMatchDelta(text, delta, possibleTextMetricMismatch),
  };
}
/**
 * Formats a signed pixel delta as "<n>px <label>", or returns `null` when the
 * change is smaller than MIN_MEANINGFUL_DELTA_PX in magnitude.
 *
 * @param value - Signed delta in pixels.
 * @param positiveLabel - Label used when `value` is greater than zero.
 * @param negativeLabel - Label used otherwise.
 * @returns The formatted phrase, or `null` for a negligible delta.
 */
function describePixelDelta(
  value: number,
  positiveLabel: string,
  negativeLabel: string,
): string | null {
  const isNegligible = Math.abs(value) < MIN_MEANINGFUL_DELTA_PX;
  if (isNegligible) {
    return null;
  }
  // Round first, then drop the sign, so the printed amount follows Math.round's
  // handling of negative halves.
  const wholePixels = Math.abs(Math.round(value));
  const label = value > 0 ? positiveLabel : negativeLabel;
  return `${wholePixels}px ${label}`;
}
/**
 * Computes the smallest rectangle that contains every rectangle in `rects`.
 *
 * Runs in a single pass without spreading the input into `Math.min`/`Math.max`,
 * so it allocates no temporary arrays and is not limited by the engine's maximum
 * argument count.
 *
 * @param rects - Rectangles to enclose.
 * @returns The bounding rectangle, or an all-zero rectangle when `rects` is empty.
 */
function unionRects(rects: Rect[]): Rect {
  // Without this guard the bounds would stay at +/-Infinity and leak into the result.
  if (rects.length === 0) return { x: 0, y: 0, width: 0, height: 0 };

  let minX = Number.POSITIVE_INFINITY;
  let minY = Number.POSITIVE_INFINITY;
  let maxX = Number.NEGATIVE_INFINITY;
  let maxY = Number.NEGATIVE_INFINITY;
  for (const rect of rects) {
    minX = Math.min(minX, rect.x);
    minY = Math.min(minY, rect.y);
    maxX = Math.max(maxX, rect.x + rect.width);
    maxY = Math.max(maxY, rect.y + rect.height);
  }
  return { x: minX, y: minY, width: maxX - minX, height: maxY - minY };
}
/**
 * Arithmetic mean of `values`.
 *
 * @param values - Numbers to average.
 * @returns The mean, or 0 for an empty list (instead of the NaN that dividing by
 *   a zero length would produce).
 */
function average(values: number[]): number {
  if (values.length === 0) return 0;
  let total = 0;
  for (const value of values) total += value;
  return total / values.length;
}
/**
 * Area shared by two axis-aligned rectangles.
 *
 * @param left - First rectangle.
 * @param right - Second rectangle.
 * @returns The overlap area; 0 when the rectangles are disjoint or merely touch.
 */
function intersectArea(left: Rect, right: Rect): number {
  const overlapLeft = Math.max(left.x, right.x);
  const overlapTop = Math.max(left.y, right.y);
  const overlapRight = Math.min(left.x + left.width, right.x + right.width);
  const overlapBottom = Math.min(left.y + left.height, right.y + right.height);

  const isDisjoint = overlapRight <= overlapLeft || overlapBottom <= overlapTop;
  return isDisjoint ? 0 : (overlapRight - overlapLeft) * (overlapBottom - overlapTop);
}
/**
 * Writes one four-channel pixel into the image's data buffer.
 *
 * Coordinates outside the image are ignored, so callers may draw shapes that
 * run past an edge. The pixel's first channel lives at `(y * width + x) * 4`.
 *
 * @param diff - Image whose `data` buffer is modified in place.
 * @param x - Pixel column.
 * @param y - Pixel row.
 * @param color - The four channel values to store, in buffer order.
 */
function setPixel(
  diff: PNG,
  x: number,
  y: number,
  color: readonly [number, number, number, number],
): void {
  const outsideColumns = x < 0 || x >= diff.width;
  const outsideRows = y < 0 || y >= diff.height;
  if (outsideColumns || outsideRows) {
    return;
  }
  const firstChannel = (y * diff.width + x) * 4;
  color.forEach((channelValue, channel) => {
    diff.data[firstChannel + channel] = channelValue;
  });
}
/**
 * Decides whether a diff region is big enough to be worth splitting.
 *
 * A region qualifies when it is at least MIN_SPLIT_REGION_HEIGHT pixels tall and
 * spans at least MIN_SPLIT_REGION_WIDTH_RATIO of the image width. Both extents
 * are inclusive (`max - min + 1`).
 *
 * @param region - Region bounds in image coordinates.
 * @param imageWidth - Width of the compared image in pixels.
 * @returns `true` when the region should be split.
 */
function shouldSplitRegion(region: MutableDiffRegion, imageWidth: number): boolean {
  const regionHeight = region.maxY - region.minY + 1;
  const isTallEnough = regionHeight >= MIN_SPLIT_REGION_HEIGHT;
  if (!isTallEnough) {
    return false;
  }
  const regionWidth = region.maxX - region.minX + 1;
  const requiredWidth = imageWidth * MIN_SPLIT_REGION_WIDTH_RATIO;
  return regionWidth >= requiredWidth;
}
/**
 * Smooths a series of per-row counts with a centred moving average.
 *
 * Each output value is the rounded mean of the input values within
 * ROW_SMOOTHING_RADIUS positions on either side; the window is truncated at both
 * ends of the series.
 *
 * @param counts - Raw counts, one per row.
 * @returns A new array of the same length holding the rounded window means.
 */
function smoothCounts(counts: number[]): number[] {
  const smoothed: number[] = [];
  for (let row = 0; row < counts.length; row += 1) {
    const windowStart = Math.max(0, row - ROW_SMOOTHING_RADIUS);
    // Exclusive end, clipped to the series length.
    const windowEnd = Math.min(counts.length, row + ROW_SMOOTHING_RADIUS + 1);
    const rowWindow = counts.slice(windowStart, windowEnd);
    const total = rowWindow.reduce((sum, count) => sum + count, 0);
    smoothed.push(Math.round(total / rowWindow.length));
  }
  return smoothed;
}
/**
 * Turns low-density bands into the vertical ranges a region is split into.
 *
 * Every band proposes a cut at its middle row (band offsets are relative to
 * `region.minY`). A cut is accepted only when the segment above it (including
 * the cut row) and the remainder below it are both at least
 * MIN_SPLIT_SEGMENT_HEIGHT rows tall; rejected cuts leave the current segment
 * growing.
 *
 * @param region - Region being split; only `minY` and `maxY` are read.
 * @param lowDensityBands - Inclusive `[start, end]` row offsets of quiet bands.
 * @returns Inclusive `[minY, maxY]` ranges covering the region top to bottom;
 *   a single range when no cut was accepted.
 */
function buildSegmentRanges(
  region: MutableDiffRegion,
  lowDensityBands: Array<[number, number]>,
): Array<[number, number]> {
  // First rows of the segments found so far; the last entry is the open segment.
  const segmentStarts: number[] = [region.minY];
  for (const [bandStart, bandEnd] of lowDensityBands) {
    const openSegmentStart = segmentStarts[segmentStarts.length - 1]!;
    const cutRow = region.minY + Math.round((bandStart + bandEnd) / 2);
    const upperTooShort = cutRow - openSegmentStart + 1 < MIN_SPLIT_SEGMENT_HEIGHT;
    const lowerTooShort = region.maxY - cutRow < MIN_SPLIT_SEGMENT_HEIGHT;
    if (!upperTooShort && !lowerTooShort) {
      segmentStarts.push(cutRow + 1);
    }
  }
  return segmentStarts.map((start, position): [number, number] => {
    const isLastSegment = position === segmentStarts.length - 1;
    return [start, isLastSegment ? region.maxY : segmentStarts[position + 1]! - 1];
  });
}
Math.min(slice.minX, x); + slice.minY = Math.min(slice.minY, y); + slice.maxX = Math.max(slice.maxX, x); + slice.maxY = Math.max(slice.maxY, y); + slice.differentPixels += 1; + slice.baselineRed += baseline.data[dataIndex]!; + slice.baselineGreen += baseline.data[dataIndex + 1]!; + slice.baselineBlue += baseline.data[dataIndex + 2]!; + slice.currentRed += current.data[dataIndex]!; + slice.currentGreen += current.data[dataIndex + 1]!; + slice.currentBlue += current.data[dataIndex + 2]!; +} diff --git a/src/utils/screenshot-diff-regions.ts b/src/utils/screenshot-diff-regions.ts new file mode 100644 index 000000000..b0fd2fce7 --- /dev/null +++ b/src/utils/screenshot-diff-regions.ts @@ -0,0 +1,370 @@ +import { PNG } from 'pngjs'; +import { splitLargeDiffRegions } from './screenshot-diff-region-split.ts'; + +export type ScreenshotDiffColor = { + r: number; + g: number; + b: number; +}; + +export type ScreenshotDiffRegion = { + index: number; + rect: { x: number; y: number; width: number; height: number }; + center: { x: number; y: number }; + normalizedRect: { x: number; y: number; width: number; height: number }; + differentPixels: number; + shareOfDiffPercentage: number; + imagePercentage: number; + densityPercentage: number; + shape: 'compact' | 'horizontal-band' | 'vertical-band' | 'large-area'; + size: 'small' | 'medium' | 'large'; + location: string; + averageBaselineColor: ScreenshotDiffColor; + averageCurrentColor: ScreenshotDiffColor; + averageBaselineColorHex: string; + averageCurrentColorHex: string; + baselineLuminance: number; + currentLuminance: number; + dominantChange: 'brighter' | 'darker' | 'color-shift' | 'mixed'; + description: string; + currentOverlayMatches?: ScreenshotDiffRegionOverlayMatch[]; +}; + +export type ScreenshotDiffRegionOverlayMatch = { + ref: string; + label?: string; + overlapPercentage: number; + regionCoveragePercentage: number; + rect: { x: number; y: number; width: number; height: number }; +}; + +const DEFAULT_MAX_DIFF_REGIONS = 
8; +const REGION_MERGE_GAP_PX = 12; + +export type MutableDiffRegion = { + minX: number; + minY: number; + maxX: number; + maxY: number; + differentPixels: number; + baselineRed: number; + baselineGreen: number; + baselineBlue: number; + currentRed: number; + currentGreen: number; + currentBlue: number; +}; + +export function summarizeDiffRegions(params: { + diffMask: Uint8Array; + baseline: PNG; + current: PNG; + totalPixels: number; + differentPixels: number; + maxRegions?: number; +}): ScreenshotDiffRegion[] { + const rawRegions = findConnectedDiffRegions(params); + const mergedRegions = + rawRegions.length <= 2000 ? mergeNearbyRegions(rawRegions, REGION_MERGE_GAP_PX) : rawRegions; + const splitRegions = splitLargeDiffRegions(mergedRegions, params); + return splitRegions + .sort((left, right) => { + const pixelDelta = right.differentPixels - left.differentPixels; + if (pixelDelta !== 0) return pixelDelta; + const topDelta = left.minY - right.minY; + if (topDelta !== 0) return topDelta; + return left.minX - right.minX; + }) + .slice(0, Math.max(0, params.maxRegions ?? 
DEFAULT_MAX_DIFF_REGIONS)) + .map((region, index) => + toScreenshotDiffRegion(region, index + 1, { + width: params.baseline.width, + height: params.baseline.height, + totalPixels: params.totalPixels, + differentPixels: params.differentPixels, + }), + ); +} + +function findConnectedDiffRegions(params: { + diffMask: Uint8Array; + baseline: PNG; + current: PNG; +}): MutableDiffRegion[] { + const { diffMask, baseline, current } = params; + const { width, height } = baseline; + const visited = new Uint8Array(diffMask.length); + const queue = new Int32Array(diffMask.length); + const regions: MutableDiffRegion[] = []; + + for (let pixelIndex = 0; pixelIndex < diffMask.length; pixelIndex += 1) { + if (diffMask[pixelIndex] !== 1 || visited[pixelIndex] === 1) continue; + + let queueStart = 0; + let queueEnd = 0; + queue[queueEnd] = pixelIndex; + queueEnd += 1; + visited[pixelIndex] = 1; + + const startX = pixelIndex % width; + const startY = Math.floor(pixelIndex / width); + const region: MutableDiffRegion = { + minX: startX, + minY: startY, + maxX: startX, + maxY: startY, + differentPixels: 0, + baselineRed: 0, + baselineGreen: 0, + baselineBlue: 0, + currentRed: 0, + currentGreen: 0, + currentBlue: 0, + }; + + while (queueStart < queueEnd) { + const currentPixelIndex = queue[queueStart]!; + queueStart += 1; + addPixelToRegion(region, currentPixelIndex, width, baseline, current); + + const x = currentPixelIndex % width; + const y = Math.floor(currentPixelIndex / width); + for (let yOffset = -1; yOffset <= 1; yOffset += 1) { + const neighborY = y + yOffset; + if (neighborY < 0 || neighborY >= height) continue; + for (let xOffset = -1; xOffset <= 1; xOffset += 1) { + if (xOffset === 0 && yOffset === 0) continue; + const neighborX = x + xOffset; + if (neighborX < 0 || neighborX >= width) continue; + const neighborIndex = neighborY * width + neighborX; + if (diffMask[neighborIndex] !== 1 || visited[neighborIndex] === 1) continue; + visited[neighborIndex] = 1; + queue[queueEnd] 
= neighborIndex; + queueEnd += 1; + } + } + } + + regions.push(region); + } + + return regions; +} + +function addPixelToRegion( + region: MutableDiffRegion, + pixelIndex: number, + width: number, + baseline: PNG, + current: PNG, +): void { + const x = pixelIndex % width; + const y = Math.floor(pixelIndex / width); + const dataIndex = pixelIndex * 4; + region.minX = Math.min(region.minX, x); + region.minY = Math.min(region.minY, y); + region.maxX = Math.max(region.maxX, x); + region.maxY = Math.max(region.maxY, y); + region.differentPixels += 1; + region.baselineRed += baseline.data[dataIndex]!; + region.baselineGreen += baseline.data[dataIndex + 1]!; + region.baselineBlue += baseline.data[dataIndex + 2]!; + region.currentRed += current.data[dataIndex]!; + region.currentGreen += current.data[dataIndex + 1]!; + region.currentBlue += current.data[dataIndex + 2]!; +} + +function mergeNearbyRegions(regions: MutableDiffRegion[], gapPx: number): MutableDiffRegion[] { + const merged: MutableDiffRegion[] = []; + for (const region of regions.sort((left, right) => { + const topDelta = left.minY - right.minY; + if (topDelta !== 0) return topDelta; + return left.minX - right.minX; + })) { + const existing = merged.find((candidate) => regionsAreNear(candidate, region, gapPx)); + if (!existing) { + merged.push({ ...region }); + continue; + } + mergeRegionInto(existing, region); + } + return merged; +} + +function regionsAreNear(left: MutableDiffRegion, right: MutableDiffRegion, gapPx: number): boolean { + return ( + left.minX - gapPx <= right.maxX && + right.minX - gapPx <= left.maxX && + left.minY - gapPx <= right.maxY && + right.minY - gapPx <= left.maxY + ); +} + +function mergeRegionInto(target: MutableDiffRegion, source: MutableDiffRegion): void { + target.minX = Math.min(target.minX, source.minX); + target.minY = Math.min(target.minY, source.minY); + target.maxX = Math.max(target.maxX, source.maxX); + target.maxY = Math.max(target.maxY, source.maxY); + 
target.differentPixels += source.differentPixels; + target.baselineRed += source.baselineRed; + target.baselineGreen += source.baselineGreen; + target.baselineBlue += source.baselineBlue; + target.currentRed += source.currentRed; + target.currentGreen += source.currentGreen; + target.currentBlue += source.currentBlue; +} + +function toScreenshotDiffRegion( + region: MutableDiffRegion, + index: number, + image: { width: number; height: number; totalPixels: number; differentPixels: number }, +): ScreenshotDiffRegion { + const rect = { + x: region.minX, + y: region.minY, + width: region.maxX - region.minX + 1, + height: region.maxY - region.minY + 1, + }; + const center = { + x: Math.round(region.minX + rect.width / 2), + y: Math.round(region.minY + rect.height / 2), + }; + const averageBaselineColor = averageRegionColor( + region.baselineRed, + region.baselineGreen, + region.baselineBlue, + region.differentPixels, + ); + const averageCurrentColor = averageRegionColor( + region.currentRed, + region.currentGreen, + region.currentBlue, + region.differentPixels, + ); + const regionArea = rect.width * rect.height; + const densityPercentage = roundPercentage(region.differentPixels / regionArea); + const baselineLuminance = Math.round(luminance(averageBaselineColor)); + const currentLuminance = Math.round(luminance(averageCurrentColor)); + const shape = describeRegionShape(rect, image.width, image.height); + const size = describeRegionSize(regionArea, image.totalPixels); + const dominantChange = describeDominantChange(averageBaselineColor, averageCurrentColor); + const location = describeRegionLocation(center, image.width, image.height); + return { + index, + rect, + center, + normalizedRect: { + x: roundPercentage(rect.x / image.width), + y: roundPercentage(rect.y / image.height), + width: roundPercentage(rect.width / image.width), + height: roundPercentage(rect.height / image.height), + }, + differentPixels: region.differentPixels, + shareOfDiffPercentage: 
roundPercentage(region.differentPixels / image.differentPixels), + imagePercentage: roundPercentage(region.differentPixels / image.totalPixels), + densityPercentage, + shape, + size, + location, + averageBaselineColor, + averageCurrentColor, + averageBaselineColorHex: toHexColor(averageBaselineColor), + averageCurrentColorHex: toHexColor(averageCurrentColor), + baselineLuminance, + currentLuminance, + dominantChange, + description: + `${size} region (${shape}) in the ${location}; ` + + `${densityPercentage}% of this region's pixels differ; ` + + `current is ${formatDominantChange(dominantChange)}.`, + }; +} + +function averageRegionColor( + red: number, + green: number, + blue: number, + pixels: number, +): ScreenshotDiffColor { + return { + r: Math.round(red / pixels), + g: Math.round(green / pixels), + b: Math.round(blue / pixels), + }; +} + +function describeRegionLocation( + center: { x: number; y: number }, + width: number, + height: number, +): string { + const horizontal = + center.x < width / 3 ? 'left' : center.x > (width * 2) / 3 ? 'right' : 'center'; + const vertical = + center.y < height / 3 ? 'top' : center.y > (height * 2) / 3 ? 'bottom' : 'middle'; + return horizontal === 'center' && vertical === 'middle' ? 'center' : `${vertical}-${horizontal}`; +} + +function describeDominantChange( + baseline: ScreenshotDiffColor, + current: ScreenshotDiffColor, +): ScreenshotDiffRegion['dominantChange'] { + const baselineLuminance = luminance(baseline); + const currentLuminance = luminance(current); + const luminanceDelta = currentLuminance - baselineLuminance; + if (Math.abs(luminanceDelta) >= 12) return luminanceDelta > 0 ? 'brighter' : 'darker'; + + const maxChannelDelta = Math.max( + Math.abs(current.r - baseline.r), + Math.abs(current.g - baseline.g), + Math.abs(current.b - baseline.b), + ); + return maxChannelDelta >= 12 ? 
'color-shift' : 'mixed'; +} + +function describeRegionShape( + rect: { width: number; height: number }, + imageWidth: number, + imageHeight: number, +): ScreenshotDiffRegion['shape'] { + if (rect.width >= imageWidth * 0.55 && rect.height >= imageHeight * 0.12) return 'large-area'; + if (rect.width >= rect.height * 2.5) return 'horizontal-band'; + if (rect.height >= rect.width * 2.5) return 'vertical-band'; + return 'compact'; +} + +function describeRegionSize(regionArea: number, totalPixels: number): ScreenshotDiffRegion['size'] { + const areaRatio = regionArea / totalPixels; + if (areaRatio >= 0.04) return 'large'; + if (areaRatio >= 0.01) return 'medium'; + return 'small'; +} + +function formatDominantChange(change: ScreenshotDiffRegion['dominantChange']): string { + switch (change) { + case 'brighter': + return 'brighter'; + case 'darker': + return 'darker'; + case 'color-shift': + return 'color-shifted'; + default: + return 'mixed'; + } +} + +function luminance(color: ScreenshotDiffColor): number { + return color.r * 0.2126 + color.g * 0.7152 + color.b * 0.0722; +} + +function toHexColor(color: ScreenshotDiffColor): string { + return `#${toHexChannel(color.r)}${toHexChannel(color.g)}${toHexChannel(color.b)}`; +} + +function toHexChannel(value: number): string { + return value.toString(16).padStart(2, '0'); +} + +function roundPercentage(ratio: number): number { + return Math.round(ratio * 100 * 100) / 100; +} diff --git a/src/utils/screenshot-diff.ts b/src/utils/screenshot-diff.ts index f1581c1a2..d30d1569c 100644 --- a/src/utils/screenshot-diff.ts +++ b/src/utils/screenshot-diff.ts @@ -3,6 +3,18 @@ import path from 'node:path'; import { PNG } from 'pngjs'; import { AppError } from '../utils/errors.ts'; import { decodePng } from './png.ts'; +import { annotateDiffRegions } from './screenshot-diff-region-overlay.ts'; +import { + summarizeNonTextDiffDeltas, + type ScreenshotNonTextDelta, +} from './screenshot-diff-non-text.ts'; +import { + summarizeScreenshotOcr, 
+ toScreenshotOcrSummary, + type ScreenshotOcrSummary, +} from './screenshot-diff-ocr.ts'; +import { summarizeDiffRegions, type ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; +import type { ScreenshotOverlayRef } from './snapshot.ts'; export type ScreenshotDimensionMismatch = { expected: { width: number; height: number }; @@ -16,11 +28,17 @@ export type ScreenshotDiffResult = { mismatchPercentage: number; match: boolean; dimensionMismatch?: ScreenshotDimensionMismatch; + regions?: ScreenshotDiffRegion[]; + currentOverlayPath?: string; + currentOverlayRefs?: ScreenshotOverlayRef[]; + ocr?: ScreenshotOcrSummary; + nonTextDeltas?: ScreenshotNonTextDelta[]; }; export type ScreenshotDiffOptions = { threshold?: number; outputPath?: string; + maxRegions?: number; }; // Each pixel is a point in 3D RGB space (R, G, B each 0–255). @@ -29,6 +47,9 @@ export type ScreenshotDiffOptions = { // We use this as the denominator so threshold 0–1 maps linearly to the full // color distance range: 0 = exact match only, 1 = everything matches. const COLOR_DISTANCE_SCALE = 255 * Math.sqrt(3); +const DIFF_CONTEXT_LIGHTEN_RATIO = 0.72; +const DIFF_CHANGE_TINT_RATIO = 0.78; +const DIFF_CHANGE_COLOR = { r: 220, g: 0, b: 0 } as const; export async function compareScreenshots( baselinePath: string, @@ -69,12 +90,13 @@ export async function compareScreenshots( const totalPixels = baseline.width * baseline.height; const maxColorDistance = threshold * COLOR_DISTANCE_SCALE; const diff = new PNG({ width: baseline.width, height: baseline.height }); + const diffMask = new Uint8Array(totalPixels); let differentPixels = 0; // PNG data is a flat RGBA buffer: [R, G, B, A, R, G, B, A, ...]. // We step by 4 to visit each pixel and compute its Euclidean distance // in RGB space between the baseline and current image. 
- for (let index = 0; index < baseline.data.length; index += 4) { + for (let index = 0, pixelIndex = 0; index < baseline.data.length; index += 4, pixelIndex += 1) { const redDelta = baseline.data[index]! - current.data[index]!; const greenDelta = baseline.data[index + 1]! - current.data[index + 1]!; const blueDelta = baseline.data[index + 2]! - current.data[index + 2]!; @@ -82,34 +104,64 @@ export async function compareScreenshots( if (colorDistance > maxColorDistance) { differentPixels += 1; - // Red highlight for different pixels - diff.data[index] = 255; - diff.data[index + 1] = 0; - diff.data[index + 2] = 0; + diffMask[pixelIndex] = 1; + const context = renderDiffContextPixel(current, index); + diff.data[index] = tintChannel(context.r, DIFF_CHANGE_COLOR.r, DIFF_CHANGE_TINT_RATIO); + diff.data[index + 1] = tintChannel(context.g, DIFF_CHANGE_COLOR.g, DIFF_CHANGE_TINT_RATIO); + diff.data[index + 2] = tintChannel(context.b, DIFF_CHANGE_COLOR.b, DIFF_CHANGE_TINT_RATIO); diff.data[index + 3] = 255; continue; } - // Unchanged pixels are converted to a dimmed grayscale (30% brightness). - // This makes the diff image look like a faded version of the original with - // red pixels popping out where differences exist. - const gray = Math.round( - (baseline.data[index]! + baseline.data[index + 1]! + baseline.data[index + 2]!) / 3, - ); - const dimmed = Math.round(gray * 0.3); - diff.data[index] = dimmed; - diff.data[index + 1] = dimmed; - diff.data[index + 2] = dimmed; + const context = renderDiffContextPixel(current, index); + diff.data[index] = context.r; + diff.data[index + 1] = context.g; + diff.data[index + 2] = context.b; diff.data[index + 3] = 255; } + const regions = + differentPixels > 0 + ? 
summarizeDiffRegions({ + diffMask, + baseline, + current, + totalPixels, + differentPixels, + maxRegions: options.maxRegions, + }) + : []; + if (differentPixels > 0 && diffOutputPath) { + annotateDiffRegions(diff, regions); await fs.mkdir(path.dirname(diffOutputPath), { recursive: true }); await fs.writeFile(diffOutputPath, PNG.sync.write(diff)); } else { await removeStaleDiffOutput(options.outputPath); } + const ocrAnalysis = + differentPixels > 0 + ? await summarizeScreenshotOcr({ + baselinePath, + currentPath, + width: baseline.width, + height: baseline.height, + }) + : undefined; + const ocr = + ocrAnalysis && ocrAnalysis.matches.length > 0 ? toScreenshotOcrSummary(ocrAnalysis) : undefined; + const nonTextDeltas = + differentPixels > 0 + ? summarizeNonTextDiffDeltas({ + diffMask, + width: baseline.width, + height: baseline.height, + regions, + ocr: ocrAnalysis, + }) + : []; + // Round to 2 decimal places: multiply percentage by 100 before rounding, // then divide back. e.g. 0.12345 → 12.345% → round(1234.5)/100 → 12.35% const mismatchPercentage = @@ -117,6 +169,9 @@ export async function compareScreenshots( return { ...(differentPixels > 0 && diffOutputPath ? { diffPath: diffOutputPath } : {}), + ...(regions.length > 0 ? { regions } : {}), + ...(ocr ? { ocr } : {}), + ...(nonTextDeltas.length > 0 ? { nonTextDeltas } : {}), totalPixels, differentPixels, mismatchPercentage, @@ -144,3 +199,15 @@ async function removeStaleDiffOutput(outputPath: string | undefined): Promise `. - Burned-in touch overlays are exported only on macOS hosts, because the overlay pipeline depends on Swift + AVFoundation helpers. - On Linux or other non-macOS hosts, `record stop` still succeeds and returns the raw video plus telemetry sidecar, and includes `overlayWarning` when burn-in overlays were skipped. 
From e63116aef410120a4a6daf7795b09be362929736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 12 Apr 2026 11:38:24 +0200 Subject: [PATCH 2/8] fix: address screenshot diff review feedback --- .../agent-device/references/verification.md | 4 ++- src/__tests__/cli-diff.test.ts | 5 +++ src/cli/commands/screenshot.ts | 14 ++++++++ .../__tests__/screenshot-diff-ocr.test.ts | 34 ++++++++++++++++++- src/utils/__tests__/screenshot-diff.test.ts | 31 +++++++++++++++++ src/utils/output.ts | 12 +++---- src/utils/screenshot-diff-non-text.ts | 18 ++++++---- src/utils/screenshot-diff-ocr.ts | 15 +++++--- src/utils/screenshot-diff-regions.ts | 2 ++ src/utils/screenshot-diff.ts | 23 ++++++------- 10 files changed, 127 insertions(+), 31 deletions(-) diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index 3e0be202e..dc8f7353c 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -46,7 +46,9 @@ agent-device diff snapshot -i Use `screenshot` when the proof needs a rendered image instead of a structural tree. - Add `--overlay-refs` when you want the saved PNG to show fresh `@eN` refs burned into the screenshot. -- Use `diff screenshot --baseline --out ` when comparing against a saved visual baseline. The text and JSON output include ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, luminance, and a short description so an implementation agent can focus on the biggest visual mismatches instead of a single global pixel percentage. The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. +- Use `diff screenshot --baseline --out ` when comparing against a saved visual baseline. 
+ - Text and JSON output include ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, luminance, and short descriptions. + - The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. - Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas such as moved labels and possible text metric mismatches. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, separators, and card/background movement, not semantic icon recognition. - Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. 
diff --git a/src/__tests__/cli-diff.test.ts b/src/__tests__/cli-diff.test.ts index 74682fa27..b3b775709 100644 --- a/src/__tests__/cli-diff.test.ts +++ b/src/__tests__/cli-diff.test.ts @@ -307,12 +307,15 @@ describe('cli diff commands', () => { const originalHome = process.env.HOME; const baselineRelative = path.join('fixtures', 'baseline.png'); const diffRelative = path.join('fixtures', 'diff.png'); + const overlayRelative = path.join('fixtures', 'diff.current-overlay.png'); const baseline = path.join(fakeHome, baselineRelative); const diffOut = path.join(fakeHome, diffRelative); + const overlayOut = path.join(fakeHome, overlayRelative); fs.mkdirSync(path.dirname(baseline), { recursive: true }); fs.writeFileSync(baseline, solidPngBuffer(10, 10, { r: 255, g: 255, b: 255 })); fs.writeFileSync(diffOut, 'stale diff'); + fs.writeFileSync(overlayOut, 'stale overlay'); process.env.HOME = fakeHome; try { @@ -324,6 +327,7 @@ describe('cli diff commands', () => { `~/${baselineRelative}`, '--out', `~/${diffRelative}`, + '--overlay-refs', '--json', ], { preserveHome: true }, @@ -335,6 +339,7 @@ describe('cli diff commands', () => { assert.equal(payload.success, true); assert.equal(payload.data.match, true); assert.equal(fs.existsSync(diffOut), false); + assert.equal(fs.existsSync(overlayOut), false); } finally { if (typeof originalHome === 'string') process.env.HOME = originalHome; else delete process.env.HOME; diff --git a/src/cli/commands/screenshot.ts b/src/cli/commands/screenshot.ts index 7016e0170..9fbbfdd8e 100644 --- a/src/cli/commands/screenshot.ts +++ b/src/cli/commands/screenshot.ts @@ -87,6 +87,8 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl } : {}), }; + } else if (flags.overlayRefs && outputPath) { + removeStaleCurrentOverlay(outputPath); } } finally { try { @@ -106,3 +108,15 @@ function deriveCurrentOverlayPath(outputPath: string): string { const base = extension ? 
outputPath.slice(0, -extension.length) : outputPath; return `${base}.current-overlay${extension || '.png'}`; } + +function removeStaleCurrentOverlay(outputPath: string): void { + try { + fs.unlinkSync(deriveCurrentOverlayPath(outputPath)); + } catch (error) { + if (!isFsError(error, 'ENOENT')) throw error; + } +} + +function isFsError(error: unknown, code: string): error is NodeJS.ErrnoException { + return typeof error === 'object' && error !== null && 'code' in error && error.code === code; +} diff --git a/src/utils/__tests__/screenshot-diff-ocr.test.ts b/src/utils/__tests__/screenshot-diff-ocr.test.ts index ff6f51120..73b720d32 100644 --- a/src/utils/__tests__/screenshot-diff-ocr.test.ts +++ b/src/utils/__tests__/screenshot-diff-ocr.test.ts @@ -1,6 +1,13 @@ import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; import { test } from 'vitest'; -import { matchOcrBlocks, parseTesseractTsv } from '../screenshot-diff-ocr.ts'; +import { + matchOcrBlocks, + parseTesseractTsv, + summarizeScreenshotOcr, +} from '../screenshot-diff-ocr.ts'; test('parseTesseractTsv groups word rows into text line blocks', () => { const blocks = parseTesseractTsv( @@ -69,3 +76,28 @@ test('matchOcrBlocks reports movement and possible text metric mismatch', () => 'Text "Wi-Fi" moved 12px right, 8px up; text box is 10px wider; possible font, weight, or text rendering mismatch.', ); }); + +test('summarizeScreenshotOcr returns undefined when tesseract exits non-zero', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'ocr-test-')); + const binDir = path.join(dir, 'bin'); + const fakeTesseract = path.join(binDir, 'tesseract'); + const originalPath = process.env.PATH; + fs.mkdirSync(binDir, { recursive: true }); + fs.writeFileSync(fakeTesseract, '#!/bin/sh\nexit 2\n'); + fs.chmodSync(fakeTesseract, 0o755); + process.env.PATH = `${binDir}${path.delimiter}${originalPath ?? 
''}`; + + try { + const result = await summarizeScreenshotOcr({ + baselinePath: path.join(dir, 'baseline.png'), + currentPath: path.join(dir, 'current.png'), + width: 100, + height: 100, + }); + assert.equal(result, undefined); + } finally { + if (originalPath === undefined) delete process.env.PATH; + else process.env.PATH = originalPath; + fs.rmSync(dir, { recursive: true, force: true }); + } +}); diff --git a/src/utils/__tests__/screenshot-diff.test.ts b/src/utils/__tests__/screenshot-diff.test.ts index 6dd98be99..b4c8240fd 100644 --- a/src/utils/__tests__/screenshot-diff.test.ts +++ b/src/utils/__tests__/screenshot-diff.test.ts @@ -186,6 +186,34 @@ test('large connected diff regions are split at horizontal low-density bands', a ]); }); +test('large connected diff regions are not split at short low-density bands', async () => { + const dir = tmpDir(); + const baseline = path.join(dir, 'baseline.png'); + const current = path.join(dir, 'current.png'); + + writeSolidPng(baseline, 100, 220, { r: 0, g: 0, b: 0 }); + + const currentPng = new PNG({ width: 100, height: 220 }); + for (let i = 0; i < currentPng.data.length; i += 4) { + currentPng.data[i] = 0; + currentPng.data[i + 1] = 0; + currentPng.data[i + 2] = 0; + currentPng.data[i + 3] = 255; + } + paintRect(currentPng, { x: 0, y: 0, width: 100, height: 80 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 50, y: 80, width: 1, height: 4 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 0, y: 84, width: 100, height: 136 }, { r: 255, g: 255, b: 255 }); + fs.writeFileSync(current, PNG.sync.write(currentPng)); + + const result = await compareScreenshots(baseline, current, { + outputPath: path.join(dir, 'diff.png'), + threshold: 0, + }); + + assert.equal(result.regions?.length, 1); + assert.deepEqual(result.regions?.[0]?.rect, { x: 0, y: 0, width: 100, height: 220 }); +}); + test('no diff path is persisted when outputPath is omitted', async () => { const dir = tmpDir(); const baseline = 
path.join(dir, 'baseline.png'); @@ -269,6 +297,9 @@ test('dimension mismatch returns expected vs actual sizes', async () => { assert.equal(result.match, false); assert.equal(result.mismatchPercentage, 100); assert.equal(result.diffPath, undefined, 'diffPath should not be set for dimension mismatch'); + assert.equal(result.regions, undefined); + assert.equal(result.ocr, undefined); + assert.equal(result.nonTextDeltas, undefined); assert.deepEqual(result.dimensionMismatch, { expected: { width: 10, height: 20 }, actual: { width: 15, height: 25 }, diff --git a/src/utils/output.ts b/src/utils/output.ts index 20ff14f7f..9ead12f9d 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -281,15 +281,15 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { lines.push( ' item | text | movePx | sizeDeltaPx | bboxBaseline | bboxCurrent | textRatio | confidence | issueHint', ); - for (const [index, match] of shownOcrMatches.entries()) { - const delta = match.delta; + for (const [index, ocrMatch] of shownOcrMatches.entries()) { + const delta = ocrMatch.delta; lines.push( - ` ${index + 1} | ${JSON.stringify(match.text)} | ` + + ` ${index + 1} | ${JSON.stringify(ocrMatch.text)} | ` + `${formatSignedPixels(delta.x)},${formatSignedPixels(delta.y)} | ` + `${formatSignedPixels(delta.width)},${formatSignedPixels(delta.height)} | ` + - `${formatRect(match.baselineRect)} | ${formatRect(match.currentRect)} | ` + - `w=${match.widthRatio} h=${match.heightRatio} | ${match.confidence} | ` + - `${match.possibleTextMetricMismatch ? 'possible-text-metric-mismatch' : '-'}`, + `${formatRect(ocrMatch.baselineRect)} | ${formatRect(ocrMatch.currentRect)} | ` + + `w=${ocrMatch.widthRatio} h=${ocrMatch.heightRatio} | ${ocrMatch.confidence} | ` + + `${ocrMatch.possibleTextMetricMismatch ? 
'possible-text-metric-mismatch' : '-'}`, ); } } diff --git a/src/utils/screenshot-diff-non-text.ts b/src/utils/screenshot-diff-non-text.ts index 6acbd18f1..0fac21d41 100644 --- a/src/utils/screenshot-diff-non-text.ts +++ b/src/utils/screenshot-diff-non-text.ts @@ -58,13 +58,17 @@ export function summarizeNonTextDiffDeltas(params: { const rawComponents = findConnectedComponents(maskedDiff, params.width, params.height); const mergedComponents = mergeNearbyComponents(rawComponents, MERGE_GAP_PX); const textBlocks = getOcrBlocks(params.ocr); - return mergedComponents - .filter(hasUsefulComponentSize) - .map((component) => toNonTextDelta(component, params, textBlocks)) - .filter((delta) => delta.rect.y >= params.height * MIN_CONTENT_Y_RATIO) - .sort((left, right) => scoreNonTextDelta(right) - scoreNonTextDelta(left)) - .slice(0, Math.max(0, params.maxDeltas ?? MAX_NON_TEXT_DELTAS)) - .map((delta, index) => ({ ...delta, index: index + 1 })); + return ( + mergedComponents + .filter(hasUsefulComponentSize) + .map((component) => toNonTextDelta(component, params, textBlocks)) + // Status bars and top chrome tend to produce noisy residuals around time, + // signal, and battery text; changed regions still report that area. + .filter((delta) => delta.rect.y >= params.height * MIN_CONTENT_Y_RATIO) + .sort((left, right) => scoreNonTextDelta(right) - scoreNonTextDelta(left)) + .slice(0, Math.max(0, params.maxDeltas ?? 
MAX_NON_TEXT_DELTAS)) + .map((delta, index) => ({ ...delta, index: index + 1 })) + ); } function maskOcrText( diff --git a/src/utils/screenshot-diff-ocr.ts b/src/utils/screenshot-diff-ocr.ts index 1b4679762..b1aa78af2 100644 --- a/src/utils/screenshot-diff-ocr.ts +++ b/src/utils/screenshot-diff-ocr.ts @@ -107,6 +107,7 @@ export function parseTesseractTsv( const level = readTsvNumber(values, indexByName, 'level'); const rawText = readTsvString(values, indexByName, 'text').trim(); const confidence = readTsvNumber(values, indexByName, 'conf'); + // Tesseract TSV uses level=5 for word rows; higher-level rows are page/block/line containers. if (level !== 5 || !isMeaningfulText(rawText) || confidence < 0) continue; const left = readTsvNumber(values, indexByName, 'left'); @@ -327,10 +328,16 @@ function scoreOcrMatch(match: ScreenshotOcrTextMatch): number { } function unionRects(rects: Rect[]): Rect { - const minX = Math.min(...rects.map((rect) => rect.x)); - const minY = Math.min(...rects.map((rect) => rect.y)); - const maxX = Math.max(...rects.map((rect) => rect.x + rect.width)); - const maxY = Math.max(...rects.map((rect) => rect.y + rect.height)); + let minX = Number.POSITIVE_INFINITY; + let minY = Number.POSITIVE_INFINITY; + let maxX = Number.NEGATIVE_INFINITY; + let maxY = Number.NEGATIVE_INFINITY; + for (const rect of rects) { + minX = Math.min(minX, rect.x); + minY = Math.min(minY, rect.y); + maxX = Math.max(maxX, rect.x + rect.width); + maxY = Math.max(maxY, rect.y + rect.height); + } return { x: minX, y: minY, width: maxX - minX, height: maxY - minY }; } diff --git a/src/utils/screenshot-diff-regions.ts b/src/utils/screenshot-diff-regions.ts index b0fd2fce7..0d5c049a9 100644 --- a/src/utils/screenshot-diff-regions.ts +++ b/src/utils/screenshot-diff-regions.ts @@ -64,6 +64,8 @@ export function summarizeDiffRegions(params: { maxRegions?: number; }): ScreenshotDiffRegion[] { const rawRegions = findConnectedDiffRegions(params); + // Avoid quadratic nearby-merge work 
on extremely noisy diffs; the later ranking + // still keeps the largest components, but tiny speckles may remain unmerged. const mergedRegions = rawRegions.length <= 2000 ? mergeNearbyRegions(rawRegions, REGION_MERGE_GAP_PX) : rawRegions; const splitRegions = splitLargeDiffRegions(mergedRegions, params); diff --git a/src/utils/screenshot-diff.ts b/src/utils/screenshot-diff.ts index d30d1569c..e86b3d313 100644 --- a/src/utils/screenshot-diff.ts +++ b/src/utils/screenshot-diff.ts @@ -105,18 +105,18 @@ export async function compareScreenshots( if (colorDistance > maxColorDistance) { differentPixels += 1; diffMask[pixelIndex] = 1; - const context = renderDiffContextPixel(current, index); - diff.data[index] = tintChannel(context.r, DIFF_CHANGE_COLOR.r, DIFF_CHANGE_TINT_RATIO); - diff.data[index + 1] = tintChannel(context.g, DIFF_CHANGE_COLOR.g, DIFF_CHANGE_TINT_RATIO); - diff.data[index + 2] = tintChannel(context.b, DIFF_CHANGE_COLOR.b, DIFF_CHANGE_TINT_RATIO); + const context = renderDiffContextChannel(current, index); + diff.data[index] = tintChannel(context, DIFF_CHANGE_COLOR.r, DIFF_CHANGE_TINT_RATIO); + diff.data[index + 1] = tintChannel(context, DIFF_CHANGE_COLOR.g, DIFF_CHANGE_TINT_RATIO); + diff.data[index + 2] = tintChannel(context, DIFF_CHANGE_COLOR.b, DIFF_CHANGE_TINT_RATIO); diff.data[index + 3] = 255; continue; } - const context = renderDiffContextPixel(current, index); - diff.data[index] = context.r; - diff.data[index + 1] = context.g; - diff.data[index + 2] = context.b; + const context = renderDiffContextChannel(current, index); + diff.data[index] = context; + diff.data[index + 1] = context; + diff.data[index + 2] = context; diff.data[index + 3] = 255; } @@ -152,7 +152,7 @@ export async function compareScreenshots( const ocr = ocrAnalysis && ocrAnalysis.matches.length > 0 ? toScreenshotOcrSummary(ocrAnalysis) : undefined; const nonTextDeltas = - differentPixels > 0 + differentPixels > 0 && ocrAnalysis ? 
summarizeNonTextDiffDeltas({ diffMask, width: baseline.width, @@ -200,12 +200,11 @@ function isFsError(error: unknown, code: string): error is NodeJS.ErrnoException return typeof error === 'object' && error !== null && 'code' in error && error.code === code; } -function renderDiffContextPixel(source: PNG, index: number): { r: number; g: number; b: number } { +function renderDiffContextChannel(source: PNG, index: number): number { const gray = Math.round( source.data[index]! * 0.299 + source.data[index + 1]! * 0.587 + source.data[index + 2]! * 0.114, ); - const channel = tintChannel(gray, 255, DIFF_CONTEXT_LIGHTEN_RATIO); - return { r: channel, g: channel, b: channel }; + return tintChannel(gray, 255, DIFF_CONTEXT_LIGHTEN_RATIO); } function tintChannel(base: number, tint: number, ratio: number): number { From 4c784273472cf79b6d0405a7b937c76c09ff7e0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 12 Apr 2026 11:41:37 +0200 Subject: [PATCH 3/8] docs: organize screenshot diff skill guidance --- skills/agent-device/references/verification.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index dc8f7353c..d44941af8 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -8,6 +8,7 @@ Open this file when the task needs evidence, regression checks, replay maintenan - `screenshot` - `diff snapshot` +- `diff screenshot` - `record` - `replay -u` - `perf` @@ -41,14 +42,23 @@ agent-device diff snapshot -i - Run `diff snapshot` to confirm the expected structural change. - Re-run full `snapshot` only when you need fresh refs. -## Visual artifacts +## Screenshot artifacts Use `screenshot` when the proof needs a rendered image instead of a structural tree. 
- Add `--overlay-refs` when you want the saved PNG to show fresh `@eN` refs burned into the screenshot. -- Use `diff screenshot --baseline --out ` when comparing against a saved visual baseline. - - Text and JSON output include ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, luminance, and short descriptions. - - The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. + +## Visual regression with diff screenshot + +Use `diff screenshot` when comparing the current rendered screen against a saved visual baseline. + +```bash +agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png +agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --overlay-refs +``` + +- Text and JSON output include ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, luminance, and short descriptions. +- The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. - Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas such as moved labels and possible text metric mismatches. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, separators, and card/background movement, not semantic icon recognition. - Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. 
When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. From 2f17f2fdcd353a5062b9440aadec6663cee3790e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 12 Apr 2026 11:48:13 +0200 Subject: [PATCH 4/8] refactor: prune screenshot diff result metadata --- .../agent-device/references/verification.md | 2 +- src/cli/commands/screenshot.ts | 4 +- src/utils/__tests__/output.test.ts | 38 +-------- .../screenshot-diff-non-text.test.ts | 11 --- .../__tests__/screenshot-diff-ocr.test.ts | 6 -- src/utils/__tests__/screenshot-diff.test.ts | 4 - src/utils/output.ts | 7 +- src/utils/screenshot-diff-non-text.ts | 78 +++++++++---------- src/utils/screenshot-diff-ocr.ts | 36 --------- src/utils/screenshot-diff-overlay-matches.ts | 17 +++- src/utils/screenshot-diff-regions.ts | 29 +------ src/utils/screenshot-diff.ts | 3 +- website/docs/docs/commands.md | 2 +- 13 files changed, 62 insertions(+), 175 deletions(-) diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index d44941af8..2c2c123e6 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -57,7 +57,7 @@ agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --overlay-refs ``` -- Text and JSON output include ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, luminance, and short descriptions. +- Text and JSON output include ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, and luminance. - The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. 
- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas such as moved labels and possible text metric mismatches. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, separators, and card/background movement, not semantic icon recognition. diff --git a/src/cli/commands/screenshot.ts b/src/cli/commands/screenshot.ts index 9fbbfdd8e..49966dab5 100644 --- a/src/cli/commands/screenshot.ts +++ b/src/cli/commands/screenshot.ts @@ -80,7 +80,9 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl result = { ...result, currentOverlayPath: overlayResult.path, - ...(overlayResult.overlayRefs ? { currentOverlayRefs: overlayResult.overlayRefs } : {}), + ...(overlayResult.overlayRefs + ? { currentOverlayRefCount: overlayResult.overlayRefs.length } + : {}), ...(result.regions && overlayResult.overlayRefs ? 
{ regions: attachCurrentOverlayMatches(result.regions, overlayResult.overlayRefs), diff --git a/src/utils/__tests__/output.test.ts b/src/utils/__tests__/output.test.ts index 002951768..b3d4b8b77 100644 --- a/src/utils/__tests__/output.test.ts +++ b/src/utils/__tests__/output.test.ts @@ -665,43 +665,28 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' mismatchPercentage: 5, diffPath: '/tmp/test/diff.png', currentOverlayPath: '/tmp/test/diff.current-overlay.png', - currentOverlayRefs: [ - { - ref: 'e1', - label: 'Continue', - rect: { x: 1, y: 2, width: 3, height: 4 }, - overlayRect: { x: 1, y: 2, width: 3, height: 4 }, - center: { x: 3, y: 4 }, - }, - ], + currentOverlayRefCount: 1, regions: [ { index: 1, rect: { x: 10, y: 20, width: 100, height: 40 }, - center: { x: 60, y: 40 }, normalizedRect: { x: 10, y: 20, width: 100, height: 40 }, differentPixels: 350, shareOfDiffPercentage: 70, - imagePercentage: 3.5, densityPercentage: 8.75, shape: 'horizontal-band', size: 'medium', location: 'top-left', - averageBaselineColor: { r: 20, g: 20, b: 20 }, - averageCurrentColor: { r: 220, g: 220, b: 220 }, averageBaselineColorHex: '#141414', averageCurrentColorHex: '#dcdcdc', baselineLuminance: 20, currentLuminance: 220, dominantChange: 'brighter', - description: - "medium region (horizontal-band) in the top-left; 8.75% of this region's pixels differ; current is brighter.", currentOverlayMatches: [ { ref: 'e1', label: 'Continue', rect: { x: 1, y: 2, width: 3, height: 4 }, - overlapPercentage: 100, regionCoveragePercentage: 12, }, ], @@ -716,15 +701,11 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' text: 'Wi-Fi', baselineRect: { x: 120, y: 320, width: 60, height: 22 }, currentRect: { x: 130, y: 332, width: 70, height: 22 }, - baselineNormalizedRect: { x: 12, y: 32, width: 6, height: 2.2 }, - currentNormalizedRect: { x: 13, y: 33.2, width: 7, height: 2.2 }, delta: { x: 10, y: 12, width: 10, height: 0 }, 
confidence: 94, widthRatio: 1.167, heightRatio: 1, possibleTextMetricMismatch: true, - description: - 'Text "Wi-Fi" moved 10px right, 12px down; text box is 10px wider; possible font, weight, or text rendering mismatch.', }, ], }, @@ -735,17 +716,7 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' slot: 'leading', likelyKind: 'icon', rect: { x: 80, y: 318, width: 30, height: 30 }, - normalizedRect: { x: 8, y: 31.8, width: 3, height: 3 }, - differentPixels: 400, - densityPercentage: 44.44, nearestText: 'Wi-Fi', - nearestTextDistancePx: 45, - evidence: [ - 'residual-diff-outside-ocr', - 'nearest-text="Wi-Fi"', - 'slot=leading', - 'shape=icon', - ], }, ], }), @@ -775,11 +746,8 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' /1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| w=1\.167 h=1 \| 94 \| possible-text-metric-mismatch/, ); assert.match(text, /Non-text visual deltas \(showing 1\/1; px\):/); - assert.match(text, /item \| region \| slot \| kind \| bboxCurrent \| nearestText \| evidence/); - assert.match( - text, - /1 \| r1 \| leading \| icon \| x=80,y=318,w=30,h=30 \| "Wi-Fi" \| residual-diff-outside-ocr,nearest-text="Wi-Fi",slot=leading,shape=icon/, - ); + assert.match(text, /item \| region \| slot \| kind \| bboxCurrent \| nearestText/); + assert.match(text, /1 \| r1 \| leading \| icon \| x=80,y=318,w=30,h=30 \| "Wi-Fi"/); assert.equal(text.includes('\x1b['), false); }); diff --git a/src/utils/__tests__/screenshot-diff-non-text.test.ts b/src/utils/__tests__/screenshot-diff-non-text.test.ts index 93441b581..88018dfae 100644 --- a/src/utils/__tests__/screenshot-diff-non-text.test.ts +++ b/src/utils/__tests__/screenshot-diff-non-text.test.ts @@ -29,23 +29,18 @@ test('summarizeNonTextDiffDeltas masks OCR text and reports leading icon residua { index: 1, rect: { x: 0, y: 20, width: 180, height: 50 }, - center: { x: 90, y: 45 }, normalizedRect: { x: 0, y: 
16.67, width: 81.82, height: 41.67 }, differentPixels: 976, shareOfDiffPercentage: 100, - imagePercentage: 3.7, densityPercentage: 10.84, shape: 'horizontal-band', size: 'medium', location: 'center', - averageBaselineColor: { r: 0, g: 0, b: 0 }, - averageCurrentColor: { r: 255, g: 255, b: 255 }, averageBaselineColorHex: '#000000', averageCurrentColorHex: '#ffffff', baselineLuminance: 0, currentLuminance: 255, dominantChange: 'brighter', - description: 'test region', }, ], ocr: { @@ -71,10 +66,4 @@ test('summarizeNonTextDiffDeltas masks OCR text and reports leading icon residua assert.equal(deltas[0]?.likelyKind, 'icon'); assert.deepEqual(deltas[0]?.rect, { x: 20, y: 30, width: 20, height: 20 }); assert.equal(deltas[0]?.nearestText, 'Wi-Fi'); - assert.deepEqual(deltas[0]?.evidence, [ - 'residual-diff-outside-ocr', - 'nearest-text="Wi-Fi"', - 'slot=leading', - 'shape=icon', - ]); }); diff --git a/src/utils/__tests__/screenshot-diff-ocr.test.ts b/src/utils/__tests__/screenshot-diff-ocr.test.ts index 73b720d32..d372b6cc7 100644 --- a/src/utils/__tests__/screenshot-diff-ocr.test.ts +++ b/src/utils/__tests__/screenshot-diff-ocr.test.ts @@ -66,15 +66,9 @@ test('matchOcrBlocks reports movement and possible text metric mismatch', () => assert.equal(matches.length, 1); assert.deepEqual(matches[0]?.delta, { x: 12, y: -8, width: 10, height: 0 }); - assert.deepEqual(matches[0]?.baselineNormalizedRect, { x: 25, y: 25, width: 12.5, height: 2.5 }); - assert.deepEqual(matches[0]?.currentNormalizedRect, { x: 28, y: 24, width: 15, height: 2.5 }); assert.equal(matches[0]?.widthRatio, 1.2); assert.equal(matches[0]?.heightRatio, 1); assert.equal(matches[0]?.possibleTextMetricMismatch, true); - assert.equal( - matches[0]?.description, - 'Text "Wi-Fi" moved 12px right, 8px up; text box is 10px wider; possible font, weight, or text rendering mismatch.', - ); }); test('summarizeScreenshotOcr returns undefined when tesseract exits non-zero', async () => { diff --git 
a/src/utils/__tests__/screenshot-diff.test.ts b/src/utils/__tests__/screenshot-diff.test.ts index b4c8240fd..06f4d4680 100644 --- a/src/utils/__tests__/screenshot-diff.test.ts +++ b/src/utils/__tests__/screenshot-diff.test.ts @@ -141,10 +141,6 @@ test('changed pixels are summarized into nearby diff regions', async () => { assert.equal(result.regions?.[0]?.currentLuminance, 255); assert.equal(result.regions?.[0]?.location, 'top-left'); assert.equal(result.regions?.[0]?.dominantChange, 'brighter'); - assert.equal( - result.regions?.[0]?.description, - "large region (horizontal-band) in the top-left; 66.67% of this region's pixels differ; current is brighter.", - ); assert.deepEqual(result.regions?.[1]?.rect, { x: 30, y: 15, width: 4, height: 4 }); const diffPng = PNG.sync.read(fs.readFileSync(diffOut)); diff --git a/src/utils/output.ts b/src/utils/output.ts index 9ead12f9d..a3f6c8c7b 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -232,7 +232,7 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { const relativePath = toRelativePath(data.currentOverlayPath); const label = useColor ? colorize('Current overlay:', 'dim') : 'Current overlay:'; const displayPath = useColor ? colorize(relativePath, 'green') : relativePath; - const refCount = Array.isArray(data.currentOverlayRefs) ? data.currentOverlayRefs.length : 0; + const refCount = toNumber(data.currentOverlayRefCount); const refSuffix = refCount > 0 ? 
` (${refCount} refs)` : ''; lines.push(` ${label} ${displayPath}${refSuffix}`); } @@ -300,13 +300,12 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { lines.push( ` Non-text visual deltas (showing ${shownNonTextDeltas.length}/${nonTextDeltas.length}; px):`, ); - lines.push(' item | region | slot | kind | bboxCurrent | nearestText | evidence'); + lines.push(' item | region | slot | kind | bboxCurrent | nearestText'); for (const delta of shownNonTextDeltas) { lines.push( ` ${delta.index} | ${delta.regionIndex ? `r${delta.regionIndex}` : '-'} | ` + `${delta.slot} | ${delta.likelyKind} | ${formatRect(delta.rect)} | ` + - `${delta.nearestText ? JSON.stringify(delta.nearestText) : '-'} | ` + - `${delta.evidence.join(',')}`, + `${delta.nearestText ? JSON.stringify(delta.nearestText) : '-'}`, ); } } diff --git a/src/utils/screenshot-diff-non-text.ts b/src/utils/screenshot-diff-non-text.ts index 0fac21d41..6e6d3e473 100644 --- a/src/utils/screenshot-diff-non-text.ts +++ b/src/utils/screenshot-diff-non-text.ts @@ -8,12 +8,7 @@ export type ScreenshotNonTextDelta = { slot: 'leading' | 'trailing' | 'background' | 'separator' | 'unknown'; likelyKind: 'icon' | 'toggle' | 'chevron' | 'separator' | 'card-or-background' | 'visual'; rect: Rect; - normalizedRect: Rect; - differentPixels: number; - densityPercentage: number; nearestText?: string; - nearestTextDistancePx?: number; - evidence: string[]; }; const MAX_NON_TEXT_DELTAS = 12; @@ -46,6 +41,10 @@ type MutableComponent = { differentPixels: number; }; +type ScoredNonTextDelta = Omit & { + score: number; +}; + export function summarizeNonTextDiffDeltas(params: { diffMask: Uint8Array; width: number; @@ -65,9 +64,9 @@ export function summarizeNonTextDiffDeltas(params: { // Status bars and top chrome tend to produce noisy residuals around time, // signal, and battery text; changed regions still report that area. 
.filter((delta) => delta.rect.y >= params.height * MIN_CONTENT_Y_RATIO) - .sort((left, right) => scoreNonTextDelta(right) - scoreNonTextDelta(left)) + .sort((left, right) => right.score - left.score) .slice(0, Math.max(0, params.maxDeltas ?? MAX_NON_TEXT_DELTAS)) - .map((delta, index) => ({ ...delta, index: index + 1 })) + .map((delta, index) => toPublicNonTextDelta(delta, index + 1)) ); } @@ -169,33 +168,36 @@ function toNonTextDelta( regions: ScreenshotDiffRegion[]; }, textBlocks: ScreenshotOcrBlock[], -): Omit { +): ScoredNonTextDelta { const rect = componentToRect(component); const regionIndex = findContainingRegionIndex(rect, params.regions); const nearestText = findNearestText(rect, textBlocks); const slot = classifySlot(rect, nearestText?.block.rect, params.width); const likelyKind = classifyLikelyKind(rect, slot, component.differentPixels); - const evidence = buildEvidence(slot, likelyKind, nearestText?.block.text); + const scoreParams = { + ...(regionIndex ? { regionIndex } : {}), + slot, + likelyKind, + rect, + }; return { ...(regionIndex ? { regionIndex } : {}), slot, likelyKind, rect, - normalizedRect: { - x: roundPercentage(rect.x / params.width), - y: roundPercentage(rect.y / params.height), - width: roundPercentage(rect.width / params.width), - height: roundPercentage(rect.height / params.height), - }, - differentPixels: component.differentPixels, - densityPercentage: roundPercentage(component.differentPixels / (rect.width * rect.height)), - ...(nearestText - ? { - nearestText: nearestText.block.text, - nearestTextDistancePx: Math.round(nearestText.distance), - } - : {}), - evidence, + ...(nearestText ? { nearestText: nearestText.block.text } : {}), + score: scoreNonTextDelta(scoreParams, component.differentPixels), + }; +} + +function toPublicNonTextDelta(delta: ScoredNonTextDelta, index: number): ScreenshotNonTextDelta { + return { + index, + ...(delta.regionIndex ? 
{ regionIndex: delta.regionIndex } : {}), + slot: delta.slot, + likelyKind: delta.likelyKind, + rect: delta.rect, + ...(delta.nearestText ? { nearestText: delta.nearestText } : {}), }; } @@ -233,7 +235,15 @@ function classifyLikelyKind( return 'visual'; } -function scoreNonTextDelta(delta: Omit): number { +function scoreNonTextDelta( + delta: { + regionIndex?: number; + slot: ScreenshotNonTextDelta['slot']; + likelyKind: ScreenshotNonTextDelta['likelyKind']; + rect: Rect; + }, + differentPixels: number, +): number { const sizePenalty = delta.rect.width >= 300 || delta.rect.height >= 160 ? -35 : 0; const regionScore = delta.regionIndex ? 20 : 0; return ( @@ -241,22 +251,10 @@ function scoreNonTextDelta(delta: Omit): number SLOT_SCORE[delta.slot] + regionScore + sizePenalty + - Math.min(20, delta.differentPixels / 200) + Math.min(20, differentPixels / 200) ); } -function buildEvidence( - slot: ScreenshotNonTextDelta['slot'], - likelyKind: ScreenshotNonTextDelta['likelyKind'], - nearestText: string | undefined, -): string[] { - const evidence = ['residual-diff-outside-ocr']; - if (nearestText) evidence.push(`nearest-text=${JSON.stringify(nearestText)}`); - if (slot !== 'unknown') evidence.push(`slot=${slot}`); - evidence.push(`shape=${likelyKind}`); - return evidence; -} - function findContainingRegionIndex( rect: Rect, regions: ScreenshotDiffRegion[], @@ -362,7 +360,3 @@ function squaredDistance(left: { x: number; y: number }, right: { x: number; y: function clamp(value: number, min: number, max: number): number { return Math.min(Math.max(value, min), max); } - -function roundPercentage(ratio: number): number { - return Math.round(ratio * 100 * 100) / 100; -} diff --git a/src/utils/screenshot-diff-ocr.ts b/src/utils/screenshot-diff-ocr.ts index b1aa78af2..567245f24 100644 --- a/src/utils/screenshot-diff-ocr.ts +++ b/src/utils/screenshot-diff-ocr.ts @@ -12,14 +12,11 @@ export type ScreenshotOcrTextMatch = { text: string; baselineRect: Rect; currentRect: Rect; - 
baselineNormalizedRect: Rect; - currentNormalizedRect: Rect; delta: { x: number; y: number; width: number; height: number }; confidence: number; widthRatio: number; heightRatio: number; possibleTextMetricMismatch: boolean; - description: string; }; export type ScreenshotOcrSummary = { @@ -266,47 +263,14 @@ function toOcrTextMatch( text: baselineBlock.text, baselineRect: baselineBlock.rect, currentRect: currentBlock.rect, - baselineNormalizedRect: baselineBlock.normalizedRect, - currentNormalizedRect: currentBlock.normalizedRect, delta, confidence: Math.round(Math.min(baselineBlock.confidence, currentBlock.confidence) * 100) / 100, widthRatio, heightRatio, possibleTextMetricMismatch, - description: describeOcrMatchDelta(baselineBlock.text, delta, possibleTextMetricMismatch), }; } -function describeOcrMatchDelta( - text: string, - delta: ScreenshotOcrTextMatch['delta'], - possibleTextMetricMismatch: boolean, -): string { - const movement = [ - describePixelDelta(delta.x, 'right', 'left'), - describePixelDelta(delta.y, 'down', 'up'), - ].filter((entry): entry is string => entry !== null); - const size = [ - describePixelDelta(delta.width, 'wider', 'narrower'), - describePixelDelta(delta.height, 'taller', 'shorter'), - ].filter((entry): entry is string => entry !== null); - const parts = [ - movement.length > 0 ? `moved ${movement.join(', ')}` : null, - size.length > 0 ? `text box is ${size.join(', ')}` : null, - possibleTextMetricMismatch ? 'possible font, weight, or text rendering mismatch' : null, - ].filter((entry): entry is string => entry !== null); - return `Text "${text}" ${parts.join('; ')}.`; -} - -function describePixelDelta( - value: number, - positiveLabel: string, - negativeLabel: string, -): string | null { - if (Math.abs(value) < MIN_MEANINGFUL_DELTA_PX) return null; - return `${Math.abs(Math.round(value))}px ${value > 0 ? 
positiveLabel : negativeLabel}`; -} - function hasMeaningfulOcrDelta(match: ScreenshotOcrTextMatch): boolean { return ( Math.abs(match.delta.x) >= MIN_MEANINGFUL_DELTA_PX || diff --git a/src/utils/screenshot-diff-overlay-matches.ts b/src/utils/screenshot-diff-overlay-matches.ts index 46f75fa86..161c44c98 100644 --- a/src/utils/screenshot-diff-overlay-matches.ts +++ b/src/utils/screenshot-diff-overlay-matches.ts @@ -30,17 +30,26 @@ function findRegionOverlayMatches( ref: overlayRef.ref, ...(overlayRef.label ? { label: overlayRef.label } : {}), rect: overlayRect, - overlapPercentage: roundPercentage(overlapArea / rectArea(overlayRect)), + overlayCoveragePercentage: roundPercentage(overlapArea / rectArea(overlayRect)), regionCoveragePercentage: roundPercentage(overlapArea / regionArea), }; }) - .filter((match): match is ScreenshotDiffRegionOverlayMatch => match !== null) + .filter( + (match): match is ScreenshotDiffRegionOverlayMatch & { overlayCoveragePercentage: number } => + match !== null, + ) .sort((left, right) => { const coverageDelta = right.regionCoveragePercentage - left.regionCoveragePercentage; if (coverageDelta !== 0) return coverageDelta; - return right.overlapPercentage - left.overlapPercentage; + return right.overlayCoveragePercentage - left.overlayCoveragePercentage; }) - .slice(0, MAX_MATCHES_PER_REGION); + .slice(0, MAX_MATCHES_PER_REGION) + .map((match) => ({ + ref: match.ref, + ...(match.label ? 
{ label: match.label } : {}), + rect: match.rect, + regionCoveragePercentage: match.regionCoveragePercentage, + })); } function intersectArea(left: Rect, right: Rect): number { diff --git a/src/utils/screenshot-diff-regions.ts b/src/utils/screenshot-diff-regions.ts index 0d5c049a9..8d4f2c6ad 100644 --- a/src/utils/screenshot-diff-regions.ts +++ b/src/utils/screenshot-diff-regions.ts @@ -1,7 +1,7 @@ import { PNG } from 'pngjs'; import { splitLargeDiffRegions } from './screenshot-diff-region-split.ts'; -export type ScreenshotDiffColor = { +type ScreenshotDiffColor = { r: number; g: number; b: number; @@ -10,30 +10,24 @@ export type ScreenshotDiffColor = { export type ScreenshotDiffRegion = { index: number; rect: { x: number; y: number; width: number; height: number }; - center: { x: number; y: number }; normalizedRect: { x: number; y: number; width: number; height: number }; differentPixels: number; shareOfDiffPercentage: number; - imagePercentage: number; densityPercentage: number; shape: 'compact' | 'horizontal-band' | 'vertical-band' | 'large-area'; size: 'small' | 'medium' | 'large'; location: string; - averageBaselineColor: ScreenshotDiffColor; - averageCurrentColor: ScreenshotDiffColor; averageBaselineColorHex: string; averageCurrentColorHex: string; baselineLuminance: number; currentLuminance: number; dominantChange: 'brighter' | 'darker' | 'color-shift' | 'mixed'; - description: string; currentOverlayMatches?: ScreenshotDiffRegionOverlayMatch[]; }; export type ScreenshotDiffRegionOverlayMatch = { ref: string; label?: string; - overlapPercentage: number; regionCoveragePercentage: number; rect: { x: number; y: number; width: number; height: number }; }; @@ -254,7 +248,6 @@ function toScreenshotDiffRegion( return { index, rect, - center, normalizedRect: { x: roundPercentage(rect.x / image.width), y: roundPercentage(rect.y / image.height), @@ -263,22 +256,15 @@ function toScreenshotDiffRegion( }, differentPixels: region.differentPixels, shareOfDiffPercentage: 
roundPercentage(region.differentPixels / image.differentPixels), - imagePercentage: roundPercentage(region.differentPixels / image.totalPixels), densityPercentage, shape, size, location, - averageBaselineColor, - averageCurrentColor, averageBaselineColorHex: toHexColor(averageBaselineColor), averageCurrentColorHex: toHexColor(averageCurrentColor), baselineLuminance, currentLuminance, dominantChange, - description: - `${size} region (${shape}) in the ${location}; ` + - `${densityPercentage}% of this region's pixels differ; ` + - `current is ${formatDominantChange(dominantChange)}.`, }; } @@ -342,19 +328,6 @@ function describeRegionSize(regionArea: number, totalPixels: number): Screenshot return 'small'; } -function formatDominantChange(change: ScreenshotDiffRegion['dominantChange']): string { - switch (change) { - case 'brighter': - return 'brighter'; - case 'darker': - return 'darker'; - case 'color-shift': - return 'color-shifted'; - default: - return 'mixed'; - } -} - function luminance(color: ScreenshotDiffColor): number { return color.r * 0.2126 + color.g * 0.7152 + color.b * 0.0722; } diff --git a/src/utils/screenshot-diff.ts b/src/utils/screenshot-diff.ts index e86b3d313..2ca7b6ee4 100644 --- a/src/utils/screenshot-diff.ts +++ b/src/utils/screenshot-diff.ts @@ -14,7 +14,6 @@ import { type ScreenshotOcrSummary, } from './screenshot-diff-ocr.ts'; import { summarizeDiffRegions, type ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; -import type { ScreenshotOverlayRef } from './snapshot.ts'; export type ScreenshotDimensionMismatch = { expected: { width: number; height: number }; @@ -30,7 +29,7 @@ export type ScreenshotDiffResult = { dimensionMismatch?: ScreenshotDimensionMismatch; regions?: ScreenshotDiffRegion[]; currentOverlayPath?: string; - currentOverlayRefs?: ScreenshotOverlayRef[]; + currentOverlayRefCount?: number; ocr?: ScreenshotOcrSummary; nonTextDeltas?: ScreenshotNonTextDelta[]; }; diff --git a/website/docs/docs/commands.md 
b/website/docs/docs/commands.md index cd46f9b92..feca1e5f4 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -553,7 +553,7 @@ agent-device record stop # Stop active recording - Recordings always produce a video artifact. When touch visualization is enabled, they also produce a gesture telemetry sidecar that can be used for post-processing or inspection. - `screenshot --overlay-refs` captures a fresh full snapshot and burns visible `@eN` refs plus their target rectangles into the saved PNG. -- `diff screenshot` compares the current screenshot to `--baseline`, prints ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, luminance, and short descriptions, and writes a diff PNG with a light grayscale current-screen context, red-tinted changed pixels, and outlined changed regions when `--out` is provided. +- `diff screenshot` compares the current screenshot to `--baseline`, prints ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, and luminance, and writes a diff PNG with a light grayscale current-screen context, red-tinted changed pixels, and outlined changed regions when `--out` is provided. - If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the diff and clustering remaining residuals. These are hints for icons, controls, separators, and card/background movement, not semantic icon recognition. - `diff screenshot --overlay-refs` additionally writes a separate current-screen overlay guide without using that annotated image for the pixel comparison. If current-screen refs intersect changed regions, the output lists the best ref matches under those regions. 
From 16849244c39e3794162f56e00e4656443049a5e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 12 Apr 2026 11:55:56 +0200 Subject: [PATCH 5/8] refactor: tighten screenshot diff readout --- .../agent-device/references/verification.md | 4 +-- src/__tests__/cli-diff.test.ts | 2 +- src/utils/__tests__/output.test.ts | 10 +++--- .../screenshot-diff-non-text.test.ts | 33 +++++++++++++++++++ .../__tests__/screenshot-diff-ocr.test.ts | 2 -- src/utils/output.ts | 24 ++------------ src/utils/screenshot-diff-non-text.ts | 31 ++++++++++++----- src/utils/screenshot-diff-ocr.ts | 4 --- website/docs/docs/commands.md | 4 +-- 9 files changed, 67 insertions(+), 47 deletions(-) diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index 2c2c123e6..c6c9066d6 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -57,10 +57,10 @@ agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --overlay-refs ``` -- Text and JSON output include ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, and luminance. +- Text output includes ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance. JSON also includes normalized bounds. - The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. - Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas such as moved labels and possible text metric mismatches. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. 
-- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, separators, and card/background movement, not semantic icon recognition. +- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition. - Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. ## Session recording diff --git a/src/__tests__/cli-diff.test.ts b/src/__tests__/cli-diff.test.ts index b3b775709..5dc33ea94 100644 --- a/src/__tests__/cli-diff.test.ts +++ b/src/__tests__/cli-diff.test.ts @@ -378,7 +378,7 @@ describe('cli diff commands', () => { assert.match(result.stdout, /diff\.current-overlay\.png \(1 refs\)/); assert.match( result.stdout, - /size=large shape=large-area density=100% boundsPct=0,0,100,100 avgColor=#000000->#ffffff luminance=0->255/, + /size=large shape=large-area density=100% avgColor=#000000->#ffffff luminance=0->255/, ); assert.match(result.stdout, /overlaps @e1 "Continue", 12% of region/); } finally { diff --git a/src/utils/__tests__/output.test.ts b/src/utils/__tests__/output.test.ts index b3d4b8b77..a073a0f66 100644 --- a/src/utils/__tests__/output.test.ts +++ b/src/utils/__tests__/output.test.ts @@ -703,8 +703,6 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' currentRect: { x: 130, y: 332, width: 70, height: 22 }, delta: { x: 10, y: 12, width: 10, height: 0 }, confidence: 
94, - widthRatio: 1.167, - heightRatio: 1, possibleTextMetricMismatch: true, }, ], @@ -727,10 +725,10 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' assert.match(text, /diff\.current-overlay\.png \(1 refs\)/); assert.match(text, /500 different \/ 10000 total pixels/); assert.match(text, /Changed regions:/); - assert.match(text, /1\. top-left x=10 y=20 100x40, 70% of diff, current is brighter/); + assert.match(text, /1\. top-left x=10 y=20 100x40, 70% of diff, change=brighter/); assert.match( text, - /size=medium shape=horizontal-band density=8\.75% boundsPct=10,20,100,40 avgColor=#141414->#dcdcdc luminance=20->220/, + /size=medium shape=horizontal-band density=8\.75% avgColor=#141414->#dcdcdc luminance=20->220/, ); assert.match(text, /overlaps @e1 "Continue", 12% of region/); assert.match( @@ -739,11 +737,11 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' ); assert.match( text, - /item \| text \| movePx \| sizeDeltaPx \| bboxBaseline \| bboxCurrent \| textRatio \| confidence \| issueHint/, + /item \| text \| movePx \| sizeDeltaPx \| bboxBaseline \| bboxCurrent \| confidence \| issueHint/, ); assert.match( text, - /1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| w=1\.167 h=1 \| 94 \| possible-text-metric-mismatch/, + /1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| 94 \| possible-text-metric-mismatch/, ); assert.match(text, /Non-text visual deltas \(showing 1\/1; px\):/); assert.match(text, /item \| region \| slot \| kind \| bboxCurrent \| nearestText/); diff --git a/src/utils/__tests__/screenshot-diff-non-text.test.ts b/src/utils/__tests__/screenshot-diff-non-text.test.ts index 88018dfae..b644922f1 100644 --- a/src/utils/__tests__/screenshot-diff-non-text.test.ts +++ b/src/utils/__tests__/screenshot-diff-non-text.test.ts @@ -67,3 +67,36 @@ test('summarizeNonTextDiffDeltas masks OCR text and reports leading icon 
residua assert.deepEqual(deltas[0]?.rect, { x: 20, y: 30, width: 20, height: 20 }); assert.equal(deltas[0]?.nearestText, 'Wi-Fi'); }); + +test('summarizeNonTextDiffDeltas omits broad background residuals', () => { + const width = 220; + const height = 120; + const diffMask = new Uint8Array(width * height); + paintMaskRect(diffMask, width, { x: 10, y: 30, width: 180, height: 40 }); + + const deltas = summarizeNonTextDiffDeltas({ + diffMask, + width, + height, + regions: [ + { + index: 1, + rect: { x: 10, y: 30, width: 180, height: 40 }, + normalizedRect: { x: 4.55, y: 25, width: 81.82, height: 33.33 }, + differentPixels: 7200, + shareOfDiffPercentage: 100, + densityPercentage: 100, + shape: 'large-area', + size: 'large', + location: 'center', + averageBaselineColorHex: '#000000', + averageCurrentColorHex: '#ffffff', + baselineLuminance: 0, + currentLuminance: 255, + dominantChange: 'brighter', + }, + ], + }); + + assert.deepEqual(deltas, []); +}); diff --git a/src/utils/__tests__/screenshot-diff-ocr.test.ts b/src/utils/__tests__/screenshot-diff-ocr.test.ts index d372b6cc7..5c0a56fb0 100644 --- a/src/utils/__tests__/screenshot-diff-ocr.test.ts +++ b/src/utils/__tests__/screenshot-diff-ocr.test.ts @@ -66,8 +66,6 @@ test('matchOcrBlocks reports movement and possible text metric mismatch', () => assert.equal(matches.length, 1); assert.deepEqual(matches[0]?.delta, { x: 12, y: -8, width: 10, height: 0 }); - assert.equal(matches[0]?.widthRatio, 1.2); - assert.equal(matches[0]?.heightRatio, 1); assert.equal(matches[0]?.possibleTextMetricMismatch, true); }); diff --git a/src/utils/output.ts b/src/utils/output.ts index a3f6c8c7b..35383439a 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -253,8 +253,7 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { const rect = region.rect; lines.push( ` ${region.index}. 
${region.location} x=${rect.x} y=${rect.y} ` + - `${rect.width}x${rect.height}, ${share}% of diff, ` + - formatDominantScreenshotChange(region.dominantChange), + `${rect.width}x${rect.height}, ${share}% of diff, change=${region.dominantChange}`, ); const detailLine = formatScreenshotRegionDetails(region); if (detailLine) { @@ -279,7 +278,7 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { `currentBlocks=${data.ocr?.currentBlocks}; showing ${shownOcrMatches.length}/${ocrMatches.length}; px):`, ); lines.push( - ' item | text | movePx | sizeDeltaPx | bboxBaseline | bboxCurrent | textRatio | confidence | issueHint', + ' item | text | movePx | sizeDeltaPx | bboxBaseline | bboxCurrent | confidence | issueHint', ); for (const [index, ocrMatch] of shownOcrMatches.entries()) { const delta = ocrMatch.delta; @@ -288,7 +287,7 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { `${formatSignedPixels(delta.x)},${formatSignedPixels(delta.y)} | ` + `${formatSignedPixels(delta.width)},${formatSignedPixels(delta.height)} | ` + `${formatRect(ocrMatch.baselineRect)} | ${formatRect(ocrMatch.currentRect)} | ` + - `w=${ocrMatch.widthRatio} h=${ocrMatch.heightRatio} | ${ocrMatch.confidence} | ` + + `${ocrMatch.confidence} | ` + `${ocrMatch.possibleTextMetricMismatch ? 'possible-text-metric-mismatch' : '-'}`, ); } @@ -321,28 +320,11 @@ function formatSignedPixels(value: number): string { return value > 0 ? `+${value}` : String(value); } -function formatDominantScreenshotChange(change: string | undefined): string { - switch (change) { - case 'brighter': - return 'current is brighter'; - case 'darker': - return 'current is darker'; - case 'color-shift': - return 'color shifted'; - default: - return 'mixed change'; - } -} - function formatScreenshotRegionDetails(region: ScreenshotDiffRegion): string | null { - const normalizedRect = region.normalizedRect; const details = [ region.size ? `size=${region.size}` : null, region.shape ? 
`shape=${region.shape}` : null, typeof region.densityPercentage === 'number' ? `density=${region.densityPercentage}%` : null, - normalizedRect - ? `boundsPct=${normalizedRect.x},${normalizedRect.y},${normalizedRect.width},${normalizedRect.height}` - : null, region.averageBaselineColorHex && region.averageCurrentColorHex ? `avgColor=${region.averageBaselineColorHex}->${region.averageCurrentColorHex}` : null, diff --git a/src/utils/screenshot-diff-non-text.ts b/src/utils/screenshot-diff-non-text.ts index 6e6d3e473..6a9ed5873 100644 --- a/src/utils/screenshot-diff-non-text.ts +++ b/src/utils/screenshot-diff-non-text.ts @@ -6,11 +6,13 @@ export type ScreenshotNonTextDelta = { index: number; regionIndex?: number; slot: 'leading' | 'trailing' | 'background' | 'separator' | 'unknown'; - likelyKind: 'icon' | 'toggle' | 'chevron' | 'separator' | 'card-or-background' | 'visual'; + likelyKind: 'icon' | 'toggle' | 'chevron' | 'separator' | 'visual'; rect: Rect; nearestText?: string; }; +type NonTextKind = ScreenshotNonTextDelta['likelyKind'] | 'background'; + const MAX_NON_TEXT_DELTAS = 12; const OCR_MASK_PADDING_PX = 8; const MIN_COMPONENT_PIXELS = 24; @@ -23,8 +25,8 @@ const KIND_SCORE = { chevron: 75, separator: 45, visual: 35, - 'card-or-background': 10, -} satisfies Record; + background: 10, +} satisfies Record; const SLOT_SCORE = { leading: 20, trailing: 20, @@ -41,7 +43,8 @@ type MutableComponent = { differentPixels: number; }; -type ScoredNonTextDelta = Omit & { +type ScoredNonTextDelta = Omit & { + likelyKind: NonTextKind; score: number; }; @@ -64,6 +67,7 @@ export function summarizeNonTextDiffDeltas(params: { // Status bars and top chrome tend to produce noisy residuals around time, // signal, and battery text; changed regions still report that area. .filter((delta) => delta.rect.y >= params.height * MIN_CONTENT_Y_RATIO) + .filter(hasAgentFacingKind) .sort((left, right) => right.score - left.score) .slice(0, Math.max(0, params.maxDeltas ?? 
MAX_NON_TEXT_DELTAS)) .map((delta, index) => toPublicNonTextDelta(delta, index + 1)) @@ -190,7 +194,10 @@ function toNonTextDelta( }; } -function toPublicNonTextDelta(delta: ScoredNonTextDelta, index: number): ScreenshotNonTextDelta { +function toPublicNonTextDelta( + delta: ScoredNonTextDelta & { likelyKind: ScreenshotNonTextDelta['likelyKind'] }, + index: number, +): ScreenshotNonTextDelta { return { index, ...(delta.regionIndex ? { regionIndex: delta.regionIndex } : {}), @@ -223,23 +230,29 @@ function classifyLikelyKind( rect: Rect, slot: ScreenshotNonTextDelta['slot'], differentPixels: number, -): ScreenshotNonTextDelta['likelyKind'] { +): NonTextKind { const aspect = rect.width / rect.height; const density = differentPixels / (rect.width * rect.height); if (slot === 'separator') return 'separator'; - if (slot === 'background') return 'card-or-background'; + if (slot === 'background') return 'background'; if (slot === 'trailing' && aspect >= 1.5 && aspect <= 3.8 && density >= 0.35) return 'toggle'; if (slot === 'trailing' && rect.width <= 44 && rect.height <= 64) return 'chevron'; if (slot === 'leading' && aspect >= 0.55 && aspect <= 1.8) return 'icon'; - if (rect.width >= 300 || rect.height >= 160) return 'card-or-background'; + if (rect.width >= 300 || rect.height >= 160) return 'background'; return 'visual'; } +function hasAgentFacingKind( + delta: ScoredNonTextDelta, +): delta is ScoredNonTextDelta & { likelyKind: ScreenshotNonTextDelta['likelyKind'] } { + return delta.likelyKind !== 'background'; +} + function scoreNonTextDelta( delta: { regionIndex?: number; slot: ScreenshotNonTextDelta['slot']; - likelyKind: ScreenshotNonTextDelta['likelyKind']; + likelyKind: NonTextKind; rect: Rect; }, differentPixels: number, diff --git a/src/utils/screenshot-diff-ocr.ts b/src/utils/screenshot-diff-ocr.ts index 567245f24..946fbb65a 100644 --- a/src/utils/screenshot-diff-ocr.ts +++ b/src/utils/screenshot-diff-ocr.ts @@ -14,8 +14,6 @@ export type ScreenshotOcrTextMatch = 
{ currentRect: Rect; delta: { x: number; y: number; width: number; height: number }; confidence: number; - widthRatio: number; - heightRatio: number; possibleTextMetricMismatch: boolean; }; @@ -265,8 +263,6 @@ function toOcrTextMatch( currentRect: currentBlock.rect, delta, confidence: Math.round(Math.min(baselineBlock.confidence, currentBlock.confidence) * 100) / 100, - widthRatio, - heightRatio, possibleTextMetricMismatch, }; } diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index feca1e5f4..f1fc44efc 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -553,9 +553,9 @@ agent-device record stop # Stop active recording - Recordings always produce a video artifact. When touch visualization is enabled, they also produce a gesture telemetry sidecar that can be used for post-processing or inspection. - `screenshot --overlay-refs` captures a fresh full snapshot and burns visible `@eN` refs plus their target rectangles into the saved PNG. -- `diff screenshot` compares the current screenshot to `--baseline`, prints ranked changed regions with screen-space rectangles, normalized bounds, shape, size, density, average color, and luminance, and writes a diff PNG with a light grayscale current-screen context, red-tinted changed pixels, and outlined changed regions when `--out` is provided. +- `diff screenshot` compares the current screenshot to `--baseline`, prints ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance, and writes a diff PNG with a light grayscale current-screen context, red-tinted changed pixels, and outlined changed regions when `--out` is provided. JSON also includes normalized bounds. - If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. 
-- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the diff and clustering remaining residuals. These are hints for icons, controls, separators, and card/background movement, not semantic icon recognition. +- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the diff and clustering remaining residuals. These are hints for icons, controls, and separators, not semantic icon recognition. - `diff screenshot --overlay-refs` additionally writes a separate current-screen overlay guide without using that annotated image for the pixel comparison. If current-screen refs intersect changed regions, the output lists the best ref matches under those regions. - In `--json` mode, each overlay ref also includes a screenshot-space `center` point for coordinate fallback like `press <x> <y>`. - Burned-in touch overlays are exported only on macOS hosts, because the overlay pipeline depends on Swift + AVFoundation helpers. From 7490a6c6233d46addb85119fde84987ca9d01636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 12 Apr 2026 12:00:34 +0200 Subject: [PATCH 6/8] refactor: generalize screenshot diff heuristics --- src/utils/screenshot-diff-non-text.ts | 74 +++++++++++++++++++---- src/utils/screenshot-diff-ocr.ts | 2 + src/utils/screenshot-diff-region-split.ts | 50 +++++++++++---- src/utils/screenshot-diff-regions.ts | 34 ++++++++--- 4 files changed, 128 insertions(+), 32 deletions(-) diff --git a/src/utils/screenshot-diff-non-text.ts b/src/utils/screenshot-diff-non-text.ts index 6a9ed5873..39afef1b6 100644 --- a/src/utils/screenshot-diff-non-text.ts +++ b/src/utils/screenshot-diff-non-text.ts @@ -19,6 +19,25 @@ const MIN_COMPONENT_PIXELS = 24; const MIN_COMPONENT_SIDE = 3; const MERGE_GAP_PX = 10; const MIN_CONTENT_Y_RATIO = 0.08; +// Non-text hints classify residual geometry relative to the screenshot size. 
+// Aspect/density checks describe common UI glyph shapes rather than app-specific elements. +const SEPARATOR_MAX_THICKNESS_PX = 3; +const SEPARATOR_MIN_WIDTH_RATIO = 0.12; +const BACKGROUND_SLOT_WIDTH_RATIO = 0.4; +const UNKNOWN_BACKGROUND_SLOT_WIDTH_RATIO = 0.35; +const LARGE_RESIDUAL_WIDTH_RATIO = 0.25; +const LARGE_RESIDUAL_HEIGHT_RATIO = 0.06; +const TOGGLE_MIN_ASPECT_RATIO = 1.5; +const TOGGLE_MAX_ASPECT_RATIO = 3.8; +const TOGGLE_MIN_DENSITY_RATIO = 0.35; +const CHEVRON_MAX_WIDTH_RATIO = 0.06; +const CHEVRON_MAX_HEIGHT_RATIO = 0.04; +const ICON_MIN_ASPECT_RATIO = 0.55; +const ICON_MAX_ASPECT_RATIO = 1.8; +const LARGE_RESIDUAL_SCORE_PENALTY = -35; +const REGION_OVERLAP_SCORE = 20; +const MAX_PIXEL_COUNT_SCORE = 20; +const PIXELS_PER_SCORE_POINT = 200; const KIND_SCORE = { icon: 90, toggle: 90, @@ -177,7 +196,7 @@ function toNonTextDelta( const regionIndex = findContainingRegionIndex(rect, params.regions); const nearestText = findNearestText(rect, textBlocks); const slot = classifySlot(rect, nearestText?.block.rect, params.width); - const likelyKind = classifyLikelyKind(rect, slot, component.differentPixels); + const likelyKind = classifyLikelyKind(rect, slot, component.differentPixels, params); const scoreParams = { ...(regionIndex ? { regionIndex } : {}), slot, @@ -190,7 +209,7 @@ function toNonTextDelta( likelyKind, rect, ...(nearestText ? 
{ nearestText: nearestText.block.text } : {}), - score: scoreNonTextDelta(scoreParams, component.differentPixels), + score: scoreNonTextDelta(scoreParams, component.differentPixels, params), }; } @@ -213,32 +232,53 @@ function classifySlot( nearestTextRect: Rect | undefined, imageWidth: number, ): ScreenshotNonTextDelta['slot'] { - if (rect.height <= 3 && rect.width >= 60) return 'separator'; + if ( + rect.height <= SEPARATOR_MAX_THICKNESS_PX && + rect.width >= imageWidth * SEPARATOR_MIN_WIDTH_RATIO + ) { + return 'separator'; + } if (!nearestTextRect) { - if (rect.width >= imageWidth * 0.4) return 'background'; + if (rect.width >= imageWidth * BACKGROUND_SLOT_WIDTH_RATIO) return 'background'; return 'unknown'; } - if (rect.width >= imageWidth * 0.4) return 'background'; + if (rect.width >= imageWidth * BACKGROUND_SLOT_WIDTH_RATIO) return 'background'; const rectCenterX = rect.x + rect.width / 2; const textCenterX = nearestTextRect.x + nearestTextRect.width / 2; if (rectCenterX < textCenterX - nearestTextRect.width / 2) return 'leading'; if (rectCenterX > textCenterX + nearestTextRect.width / 2) return 'trailing'; - return rect.width >= imageWidth * 0.35 ? 'background' : 'unknown'; + return rect.width >= imageWidth * UNKNOWN_BACKGROUND_SLOT_WIDTH_RATIO ? 
'background' : 'unknown'; } function classifyLikelyKind( rect: Rect, slot: ScreenshotNonTextDelta['slot'], differentPixels: number, + image: { width: number; height: number }, ): NonTextKind { const aspect = rect.width / rect.height; const density = differentPixels / (rect.width * rect.height); if (slot === 'separator') return 'separator'; if (slot === 'background') return 'background'; - if (slot === 'trailing' && aspect >= 1.5 && aspect <= 3.8 && density >= 0.35) return 'toggle'; - if (slot === 'trailing' && rect.width <= 44 && rect.height <= 64) return 'chevron'; - if (slot === 'leading' && aspect >= 0.55 && aspect <= 1.8) return 'icon'; - if (rect.width >= 300 || rect.height >= 160) return 'background'; + if ( + slot === 'trailing' && + aspect >= TOGGLE_MIN_ASPECT_RATIO && + aspect <= TOGGLE_MAX_ASPECT_RATIO && + density >= TOGGLE_MIN_DENSITY_RATIO + ) { + return 'toggle'; + } + if ( + slot === 'trailing' && + rect.width <= image.width * CHEVRON_MAX_WIDTH_RATIO && + rect.height <= image.height * CHEVRON_MAX_HEIGHT_RATIO + ) { + return 'chevron'; + } + if (slot === 'leading' && aspect >= ICON_MIN_ASPECT_RATIO && aspect <= ICON_MAX_ASPECT_RATIO) { + return 'icon'; + } + if (isLargeResidual(rect, image)) return 'background'; return 'visual'; } @@ -256,15 +296,23 @@ function scoreNonTextDelta( rect: Rect; }, differentPixels: number, + image: { width: number; height: number }, ): number { - const sizePenalty = delta.rect.width >= 300 || delta.rect.height >= 160 ? -35 : 0; - const regionScore = delta.regionIndex ? 20 : 0; + const sizePenalty = isLargeResidual(delta.rect, image) ? LARGE_RESIDUAL_SCORE_PENALTY : 0; + const regionScore = delta.regionIndex ? 
REGION_OVERLAP_SCORE : 0; return ( KIND_SCORE[delta.likelyKind] + SLOT_SCORE[delta.slot] + regionScore + sizePenalty + - Math.min(20, differentPixels / 200) + Math.min(MAX_PIXEL_COUNT_SCORE, differentPixels / PIXELS_PER_SCORE_POINT) + ); +} + +function isLargeResidual(rect: Rect, image: { width: number; height: number }): boolean { + return ( + rect.width >= image.width * LARGE_RESIDUAL_WIDTH_RATIO || + rect.height >= image.height * LARGE_RESIDUAL_HEIGHT_RATIO ); } diff --git a/src/utils/screenshot-diff-ocr.ts b/src/utils/screenshot-diff-ocr.ts index 946fbb65a..9e9f011d3 100644 --- a/src/utils/screenshot-diff-ocr.ts +++ b/src/utils/screenshot-diff-ocr.ts @@ -38,6 +38,8 @@ type TesseractWord = { const OCR_TIMEOUT_MS = 10_000; const MAX_OCR_MATCHES = 12; +// OCR text matching uses small generic movement/shape thresholds; the fixed gap +// is only a floor before falling back to word-height-relative spacing. const MIN_MEANINGFUL_DELTA_PX = 2; const MIN_SEGMENT_GAP_PX = 48; const TEXT_WIDTH_MISMATCH_RATIO = 0.08; diff --git a/src/utils/screenshot-diff-region-split.ts b/src/utils/screenshot-diff-region-split.ts index 3b53355c5..a84c44a46 100644 --- a/src/utils/screenshot-diff-region-split.ts +++ b/src/utils/screenshot-diff-region-split.ts @@ -1,9 +1,13 @@ import { PNG } from 'pngjs'; import type { MutableDiffRegion } from './screenshot-diff-regions.ts'; -const MIN_SPLIT_REGION_HEIGHT = 180; +// Region splitting is based on screen-relative heights so it works on phone, +// tablet, and desktop screenshots; the pixel floors only suppress tiny fixtures/noise. 
+const MIN_SPLIT_REGION_HEIGHT_RATIO = 0.07; +const MIN_SPLIT_REGION_HEIGHT_FLOOR_PX = 48; const MIN_SPLIT_REGION_WIDTH_RATIO = 0.35; -const MIN_SPLIT_SEGMENT_HEIGHT = 80; +const MIN_SPLIT_SEGMENT_HEIGHT_RATIO = 0.03; +const MIN_SPLIT_SEGMENT_HEIGHT_FLOOR_PX = 24; const LOW_DENSITY_RATIO = 0.08; const MIN_LOW_DENSITY_BAND_HEIGHT = 6; const ROW_SMOOTHING_RADIUS = 3; @@ -13,21 +17,33 @@ export function splitLargeDiffRegions( params: { diffMask: Uint8Array; baseline: PNG; current: PNG }, ): MutableDiffRegion[] { return regions.flatMap((region) => - shouldSplitRegion(region, params.baseline.width) - ? splitRegionByHorizontalDensity(region, params) + shouldSplitRegion(region, params.baseline.width, params.baseline.height) + ? splitRegionByHorizontalDensity( + region, + params, + minSplitSegmentHeight(params.baseline.height), + ) : [region], ); } -function shouldSplitRegion(region: MutableDiffRegion, imageWidth: number): boolean { +function shouldSplitRegion( + region: MutableDiffRegion, + imageWidth: number, + imageHeight: number, +): boolean { const width = region.maxX - region.minX + 1; const height = region.maxY - region.minY + 1; - return height >= MIN_SPLIT_REGION_HEIGHT && width >= imageWidth * MIN_SPLIT_REGION_WIDTH_RATIO; + return ( + height >= minSplitRegionHeight(imageHeight) && + width >= imageWidth * MIN_SPLIT_REGION_WIDTH_RATIO + ); } function splitRegionByHorizontalDensity( region: MutableDiffRegion, params: { diffMask: Uint8Array; baseline: PNG; current: PNG }, + minSegmentHeight: number, ): MutableDiffRegion[] { const rowCounts = measureRowDiffCounts(region, params.diffMask, params.baseline.width); const smoothed = smoothCounts(rowCounts); @@ -35,7 +51,7 @@ function splitRegionByHorizontalDensity( smoothed, Math.max(1, Math.round((region.maxX - region.minX + 1) * LOW_DENSITY_RATIO)), ); - const ranges = buildSegmentRanges(region, lowDensityBands); + const ranges = buildSegmentRanges(region, lowDensityBands, minSegmentHeight); if (ranges.length <= 1) 
return [region]; const splitRegions = ranges @@ -96,15 +112,13 @@ function findLowDensityBands(counts: number[], threshold: number): Array<[number function buildSegmentRanges( region: MutableDiffRegion, lowDensityBands: Array<[number, number]>, + minSegmentHeight: number, ): Array<[number, number]> { const ranges: Array<[number, number]> = []; let segmentStart = region.minY; for (const [relativeStart, relativeEnd] of lowDensityBands) { const cutY = region.minY + Math.round((relativeStart + relativeEnd) / 2); - if ( - cutY - segmentStart + 1 < MIN_SPLIT_SEGMENT_HEIGHT || - region.maxY - cutY < MIN_SPLIT_SEGMENT_HEIGHT - ) { + if (cutY - segmentStart + 1 < minSegmentHeight || region.maxY - cutY < minSegmentHeight) { continue; } ranges.push([segmentStart, cutY]); @@ -114,6 +128,20 @@ function buildSegmentRanges( return ranges; } +function minSplitRegionHeight(imageHeight: number): number { + return Math.max( + MIN_SPLIT_REGION_HEIGHT_FLOOR_PX, + Math.round(imageHeight * MIN_SPLIT_REGION_HEIGHT_RATIO), + ); +} + +function minSplitSegmentHeight(imageHeight: number): number { + return Math.max( + MIN_SPLIT_SEGMENT_HEIGHT_FLOOR_PX, + Math.round(imageHeight * MIN_SPLIT_SEGMENT_HEIGHT_RATIO), + ); +} + function buildRegionSlice( region: MutableDiffRegion, minY: number, diff --git a/src/utils/screenshot-diff-regions.ts b/src/utils/screenshot-diff-regions.ts index 8d4f2c6ad..fd736b478 100644 --- a/src/utils/screenshot-diff-regions.ts +++ b/src/utils/screenshot-diff-regions.ts @@ -34,6 +34,15 @@ export type ScreenshotDiffRegionOverlayMatch = { const DEFAULT_MAX_DIFF_REGIONS = 8; const REGION_MERGE_GAP_PX = 12; +const MAX_REGIONS_TO_MERGE = 2000; +// These region labels are coarse, screen-relative buckets for agent guidance, +// not tuned to a specific screenshot size or app layout. 
+const DOMINANT_CHANGE_MIN_CHANNEL_DELTA = 12; +const LARGE_AREA_MIN_WIDTH_RATIO = 0.55; +const LARGE_AREA_MIN_HEIGHT_RATIO = 0.12; +const BAND_MIN_ASPECT_RATIO = 2.5; +const LARGE_REGION_MIN_AREA_RATIO = 0.04; +const MEDIUM_REGION_MIN_AREA_RATIO = 0.01; export type MutableDiffRegion = { minX: number; @@ -61,7 +70,9 @@ export function summarizeDiffRegions(params: { // Avoid quadratic nearby-merge work on extremely noisy diffs; the later ranking // still keeps the largest components, but tiny speckles may remain unmerged. const mergedRegions = - rawRegions.length <= 2000 ? mergeNearbyRegions(rawRegions, REGION_MERGE_GAP_PX) : rawRegions; + rawRegions.length <= MAX_REGIONS_TO_MERGE + ? mergeNearbyRegions(rawRegions, REGION_MERGE_GAP_PX) + : rawRegions; const splitRegions = splitLargeDiffRegions(mergedRegions, params); return splitRegions .sort((left, right) => { @@ -300,14 +311,16 @@ function describeDominantChange( const baselineLuminance = luminance(baseline); const currentLuminance = luminance(current); const luminanceDelta = currentLuminance - baselineLuminance; - if (Math.abs(luminanceDelta) >= 12) return luminanceDelta > 0 ? 'brighter' : 'darker'; + if (Math.abs(luminanceDelta) >= DOMINANT_CHANGE_MIN_CHANNEL_DELTA) { + return luminanceDelta > 0 ? 'brighter' : 'darker'; + } const maxChannelDelta = Math.max( Math.abs(current.r - baseline.r), Math.abs(current.g - baseline.g), Math.abs(current.b - baseline.b), ); - return maxChannelDelta >= 12 ? 'color-shift' : 'mixed'; + return maxChannelDelta >= DOMINANT_CHANGE_MIN_CHANNEL_DELTA ? 
'color-shift' : 'mixed'; } function describeRegionShape( @@ -315,16 +328,21 @@ function describeRegionShape( imageWidth: number, imageHeight: number, ): ScreenshotDiffRegion['shape'] { - if (rect.width >= imageWidth * 0.55 && rect.height >= imageHeight * 0.12) return 'large-area'; - if (rect.width >= rect.height * 2.5) return 'horizontal-band'; - if (rect.height >= rect.width * 2.5) return 'vertical-band'; + if ( + rect.width >= imageWidth * LARGE_AREA_MIN_WIDTH_RATIO && + rect.height >= imageHeight * LARGE_AREA_MIN_HEIGHT_RATIO + ) { + return 'large-area'; + } + if (rect.width >= rect.height * BAND_MIN_ASPECT_RATIO) return 'horizontal-band'; + if (rect.height >= rect.width * BAND_MIN_ASPECT_RATIO) return 'vertical-band'; return 'compact'; } function describeRegionSize(regionArea: number, totalPixels: number): ScreenshotDiffRegion['size'] { const areaRatio = regionArea / totalPixels; - if (areaRatio >= 0.04) return 'large'; - if (areaRatio >= 0.01) return 'medium'; + if (areaRatio >= LARGE_REGION_MIN_AREA_RATIO) return 'large'; + if (areaRatio >= MEDIUM_REGION_MIN_AREA_RATIO) return 'medium'; return 'small'; } From b5440fa8081c4b88aca9e6a61e010768eab1d793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 12 Apr 2026 12:18:51 +0200 Subject: [PATCH 7/8] feat: add compact screenshot diff hints --- .../agent-device/references/verification.md | 2 +- src/utils/__tests__/output.test.ts | 34 ++++- .../__tests__/screenshot-diff-ocr.test.ts | 99 +++++++++++++- src/utils/output.ts | 77 ++++++++++- src/utils/screenshot-diff-non-text.ts | 91 ++++++++++++- src/utils/screenshot-diff-ocr.ts | 128 ++++++++++++++++++ src/utils/screenshot-diff.ts | 15 +- website/docs/docs/commands.md | 2 +- 8 files changed, 436 insertions(+), 12 deletions(-) diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index c6c9066d6..a2922f9e6 100644 --- a/skills/agent-device/references/verification.md +++ 
b/skills/agent-device/references/verification.md @@ -59,7 +59,7 @@ agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --ove - Text output includes ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance. JSON also includes normalized bounds. - The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. -- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas such as moved labels and possible text metric mismatches. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. +- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas, movement clusters, added/removed text candidates, and bbox size-change hints. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition. - Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. 
diff --git a/src/utils/__tests__/output.test.ts b/src/utils/__tests__/output.test.ts index a073a0f66..f2e44b7e7 100644 --- a/src/utils/__tests__/output.test.ts +++ b/src/utils/__tests__/output.test.ts @@ -706,6 +706,29 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' possibleTextMetricMismatch: true, }, ], + addedText: [ + { + text: 'Accessibility', + rect: { x: 220, y: 2520, width: 220, height: 40 }, + confidence: 93, + }, + ], + removedText: [ + { + text: 'VPN', + rect: { x: 120, y: 2100, width: 90, height: 36 }, + confidence: 96, + }, + ], + movementClusters: [ + { + texts: ['Wi-Fi', 'Bluetooth'], + averageDelta: { x: 10, y: 12 }, + xRange: { min: 10, max: 12 }, + yRange: { min: 10, max: 14 }, + confidence: 90, + }, + ], }, nonTextDeltas: [ { @@ -724,6 +747,15 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' assert.match(text, /Current overlay:/); assert.match(text, /diff\.current-overlay\.png \(1 refs\)/); assert.match(text, /500 different \/ 10000 total pixels/); + assert.match(text, /Hints:/); + assert.match( + text, + /text movement cluster: "Wi-Fi", "Bluetooth" dx=\+10\.\.\+12px dy=\+10\.\.\+14px/, + ); + assert.match(text, /added text candidates: "Accessibility" at x=220,y=2520/); + assert.match(text, /removed text candidates: "VPN" at x=120,y=2100/); + assert.match(text, /non-text controls\/boundaries: icon near "Wi-Fi" r1/); + assert.match(text, /largest changed region: r1 top-left 70% of diff, brighter/); assert.match(text, /Changed regions:/); assert.match(text, /1\. 
top-left x=10 y=20 100x40, 70% of diff, change=brighter/); assert.match( @@ -741,7 +773,7 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' ); assert.match( text, - /1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| 94 \| possible-text-metric-mismatch/, + /1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| 94 \| ocr-bbox-size-change/, ); assert.match(text, /Non-text visual deltas \(showing 1\/1; px\):/); assert.match(text, /item \| region \| slot \| kind \| bboxCurrent \| nearestText/); diff --git a/src/utils/__tests__/screenshot-diff-ocr.test.ts b/src/utils/__tests__/screenshot-diff-ocr.test.ts index 5c0a56fb0..139d502aa 100644 --- a/src/utils/__tests__/screenshot-diff-ocr.test.ts +++ b/src/utils/__tests__/screenshot-diff-ocr.test.ts @@ -7,6 +7,8 @@ import { matchOcrBlocks, parseTesseractTsv, summarizeScreenshotOcr, + summarizeOcrMovementClusters, + summarizeOcrTextChanges, } from '../screenshot-diff-ocr.ts'; test('parseTesseractTsv groups word rows into text line blocks', () => { @@ -44,7 +46,7 @@ test('parseTesseractTsv groups word rows into text line blocks', () => { }); }); -test('matchOcrBlocks reports movement and possible text metric mismatch', () => { +test('matchOcrBlocks reports movement and OCR bbox size change', () => { const matches = matchOcrBlocks( [ { @@ -69,6 +71,101 @@ test('matchOcrBlocks reports movement and possible text metric mismatch', () => assert.equal(matches[0]?.possibleTextMetricMismatch, true); }); +test('summarizeOcrTextChanges normalizes noisy OCR labels before reporting candidates', () => { + const changes = summarizeOcrTextChanges( + [ + { + text: 'Airplane Mode', + confidence: 95, + rect: { x: 100, y: 200, width: 80, height: 20 }, + normalizedRect: { x: 25, y: 25, width: 20, height: 2.5 }, + }, + { + text: '2) Personal Hotspot', + confidence: 90, + rect: { x: 100, y: 260, width: 160, height: 20 }, + normalizedRect: { x: 
25, y: 32.5, width: 40, height: 2.5 }, + }, + { + text: 'Removed Row', + confidence: 91, + rect: { x: 100, y: 320, width: 140, height: 20 }, + normalizedRect: { x: 25, y: 40, width: 35, height: 2.5 }, + }, + { + text: '4:44', + confidence: 96, + rect: { x: 10, y: 20, width: 60, height: 20 }, + normalizedRect: { x: 2.5, y: 2.5, width: 15, height: 2.5 }, + }, + ], + [ + { + text: 'Airplane Mode @e~', + confidence: 91, + rect: { x: 120, y: 210, width: 120, height: 20 }, + normalizedRect: { x: 30, y: 26.25, width: 30, height: 2.5 }, + }, + { + text: 'Personal Hotspot', + confidence: 92, + rect: { x: 120, y: 270, width: 140, height: 20 }, + normalizedRect: { x: 30, y: 33.75, width: 35, height: 2.5 }, + }, + { + text: 'Added Row', + confidence: 93, + rect: { x: 120, y: 340, width: 110, height: 20 }, + normalizedRect: { x: 30, y: 42.5, width: 27.5, height: 2.5 }, + }, + ], + 800, + ); + + assert.deepEqual( + changes.addedText.map((change) => change.text), + ['Added Row'], + ); + assert.deepEqual( + changes.removedText.map((change) => change.text), + ['Removed Row'], + ); +}); + +test('summarizeOcrMovementClusters groups repeated x-axis text movement', () => { + const clusters = summarizeOcrMovementClusters([ + { + text: 'Wi-Fi', + baselineRect: { x: 100, y: 200, width: 50, height: 20 }, + currentRect: { x: 286, y: 120, width: 50, height: 20 }, + delta: { x: 186, y: -80, width: 0, height: 0 }, + confidence: 96, + possibleTextMetricMismatch: false, + }, + { + text: 'Bluetooth', + baselineRect: { x: 100, y: 260, width: 90, height: 20 }, + currentRect: { x: 284, y: 190, width: 90, height: 20 }, + delta: { x: 184, y: -70, width: 0, height: 0 }, + confidence: 90, + possibleTextMetricMismatch: false, + }, + { + text: 'Search', + baselineRect: { x: 100, y: 500, width: 90, height: 20 }, + currentRect: { x: 52, y: 560, width: 90, height: 20 }, + delta: { x: -48, y: 60, width: 0, height: 0 }, + confidence: 94, + possibleTextMetricMismatch: false, + }, + ]); + + 
assert.equal(clusters.length, 1); + assert.deepEqual(clusters[0]?.texts, ['Wi-Fi', 'Bluetooth']); + assert.deepEqual(clusters[0]?.xRange, { min: 184, max: 186 }); + assert.deepEqual(clusters[0]?.yRange, { min: -80, max: -70 }); +}); + test('summarizeScreenshotOcr returns undefined when tesseract exits non-zero', async () => { const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'ocr-test-')); const binDir = path.join(dir, 'bin'); diff --git a/src/utils/output.ts b/src/utils/output.ts index 35383439a..547d239ba 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -242,6 +242,12 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { lines.push(` ${diffCount} different / ${totalPixels} total pixels`); } + const hints = !match && !dimensionMismatch ? formatScreenshotDiffHints(data) : []; + if (hints.length > 0) { + lines.push(' Hints:'); + for (const hint of hints) lines.push(` - ${hint}`); + } + const regions = Array.isArray(data.regions) ? data.regions : []; if (!match && !dimensionMismatch && regions.length > 0) { lines.push(' Changed regions:'); @@ -288,7 +294,7 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { `${formatSignedPixels(delta.width)},${formatSignedPixels(delta.height)} | ` + `${formatRect(ocrMatch.baselineRect)} | ${formatRect(ocrMatch.currentRect)} | ` + `${ocrMatch.confidence} | ` + - `${ocrMatch.possibleTextMetricMismatch ? 'possible-text-metric-mismatch' : '-'}`, + `${ocrMatch.possibleTextMetricMismatch ? 'ocr-bbox-size-change' : '-'}`, ); } } @@ -320,6 +326,75 @@ function formatSignedPixels(value: number): string { return value > 0 ? `+${value}` : String(value); } +function formatScreenshotDiffHints(data: ScreenshotDiffResult): string[] { + const hints: string[] = []; + const clusters = data.ocr?.movementClusters ?? 
[]; + for (const cluster of clusters.slice(0, 2)) { + hints.push( + `text movement cluster: ${formatQuotedList(cluster.texts)} dx=${formatRange(cluster.xRange)}px ` + + `dy=${formatRange(cluster.yRange)}px`, + ); + } + + const addedText = data.ocr?.addedText ?? []; + if (addedText.length > 0) { + hints.push(`added text candidates: ${formatTextChanges(addedText)}`); + } + + const removedText = data.ocr?.removedText ?? []; + if (removedText.length > 0) { + hints.push(`removed text candidates: ${formatTextChanges(removedText)}`); + } + + const controlDeltas = (data.nonTextDeltas ?? []) + .filter((delta) => ['icon', 'toggle', 'chevron', 'separator'].includes(delta.likelyKind)) + .slice(0, 3); + if (controlDeltas.length > 0) { + hints.push(`non-text controls/boundaries: ${controlDeltas.map(formatNonTextHint).join('; ')}`); + } + + const largestRegion = data.regions?.[0]; + if (largestRegion) { + hints.push( + `largest changed region: r${largestRegion.index} ${largestRegion.location} ` + + `${largestRegion.shareOfDiffPercentage}% of diff, ${largestRegion.dominantChange}`, + ); + } + + return hints.slice(0, 6); +} + +function formatTextChanges( + changes: Array<{ text: string; rect: { x: number; y: number; width: number; height: number } }>, +): string { + return changes + .slice(0, 3) + .map((change) => `${JSON.stringify(change.text)} at x=${change.rect.x},y=${change.rect.y}`) + .join(', '); +} + +function formatNonTextHint(delta: { + likelyKind: string; + nearestText?: string; + regionIndex?: number; +}): string { + const anchor = delta.nearestText ? ` near ${JSON.stringify(delta.nearestText)}` : ''; + const region = delta.regionIndex ? ` r${delta.regionIndex}` : ''; + return `${delta.likelyKind}${anchor}${region}`; +} + +function formatRange(range: { min: number; max: number }): string { + return range.min === range.max + ? 
formatSignedPixels(range.min) + : `${formatSignedPixels(range.min)}..${formatSignedPixels(range.max)}`; +} + +function formatQuotedList(values: string[]): string { + const shown = values.slice(0, 4).map((value) => JSON.stringify(value)); + const suffix = values.length > shown.length ? ` +${values.length - shown.length} more` : ''; + return `${shown.join(', ')}${suffix}`; +} + function formatScreenshotRegionDetails(region: ScreenshotDiffRegion): string | null { const details = [ region.size ? `size=${region.size}` : null, diff --git a/src/utils/screenshot-diff-non-text.ts b/src/utils/screenshot-diff-non-text.ts index 39afef1b6..6a52a21b0 100644 --- a/src/utils/screenshot-diff-non-text.ts +++ b/src/utils/screenshot-diff-non-text.ts @@ -67,6 +67,11 @@ type ScoredNonTextDelta = Omit & score: number; }; +type OcrRow = { + rect: Rect; + blocks: ScreenshotOcrBlock[]; +}; + export function summarizeNonTextDiffDeltas(params: { diffMask: Uint8Array; width: number; @@ -78,11 +83,12 @@ export function summarizeNonTextDiffDeltas(params: { const maskedDiff = maskOcrText(params.diffMask, params.width, params.height, params.ocr); const rawComponents = findConnectedComponents(maskedDiff, params.width, params.height); const mergedComponents = mergeNearbyComponents(rawComponents, MERGE_GAP_PX); - const textBlocks = getOcrBlocks(params.ocr); + const currentRows = groupOcrRows(params.ocr?.currentBlocksRaw ?? []); + const fallbackTextBlocks = getOcrBlocks(params.ocr); return ( mergedComponents .filter(hasUsefulComponentSize) - .map((component) => toNonTextDelta(component, params, textBlocks)) + .map((component) => toNonTextDelta(component, params, currentRows, fallbackTextBlocks)) // Status bars and top chrome tend to produce noisy residuals around time, // signal, and battery text; changed regions still report that area. 
.filter((delta) => delta.rect.y >= params.height * MIN_CONTENT_Y_RATIO) @@ -190,12 +196,13 @@ function toNonTextDelta( height: number; regions: ScreenshotDiffRegion[]; }, - textBlocks: ScreenshotOcrBlock[], + currentRows: OcrRow[], + fallbackTextBlocks: ScreenshotOcrBlock[], ): ScoredNonTextDelta { const rect = componentToRect(component); const regionIndex = findContainingRegionIndex(rect, params.regions); - const nearestText = findNearestText(rect, textBlocks); - const slot = classifySlot(rect, nearestText?.block.rect, params.width); + const textAnchor = findTextAnchor(rect, currentRows, fallbackTextBlocks); + const slot = classifySlot(rect, textAnchor?.block.rect, params.width); const likelyKind = classifyLikelyKind(rect, slot, component.differentPixels, params); const scoreParams = { ...(regionIndex ? { regionIndex } : {}), @@ -208,7 +215,7 @@ function toNonTextDelta( slot, likelyKind, rect, - ...(nearestText ? { nearestText: nearestText.block.text } : {}), + ...(textAnchor ? { nearestText: cleanOcrAnchorText(textAnchor.block.text) } : {}), score: scoreNonTextDelta(scoreParams, component.differentPixels, params), }; } @@ -331,6 +338,50 @@ function findContainingRegionIndex( return bestRegion?.index; } +function findTextAnchor( + rect: Rect, + currentRows: OcrRow[], + fallbackTextBlocks: ScreenshotOcrBlock[], +): { block: ScreenshotOcrBlock; distance: number } | undefined { + const row = findOverlappingRow(rect, currentRows); + if (row) return findNearestText(rect, row.blocks); + return findNearestText(rect, fallbackTextBlocks); +} + +function findOverlappingRow(rect: Rect, rows: OcrRow[]): OcrRow | undefined { + let bestRow: OcrRow | undefined; + let bestOverlap = 0; + for (const row of rows) { + const overlap = verticalOverlap(rect, row.rect); + if (overlap <= bestOverlap) continue; + bestOverlap = overlap; + bestRow = row; + } + return bestRow; +} + +function groupOcrRows(blocks: ScreenshotOcrBlock[]): OcrRow[] { + const rows: OcrRow[] = []; + for (const block 
of [...blocks].sort((left, right) => left.rect.y - right.rect.y)) { + const row = rows.find((candidate) => blocksShareRow(candidate.rect, block.rect)); + if (!row) { + rows.push({ rect: block.rect, blocks: [block] }); + continue; + } + row.blocks.push(block); + row.blocks.sort((left, right) => left.rect.x - right.rect.x); + row.rect = unionRects([row.rect, block.rect]); + } + return rows; +} + +function blocksShareRow(left: Rect, right: Rect): boolean { + const overlap = verticalOverlap(left, right); + if (overlap > 0) return true; + const centerDistance = Math.abs(rectCenter(left).y - rectCenter(right).y); + return centerDistance <= Math.max(left.height, right.height) * 0.5; +} + function findNearestText( rect: Rect, textBlocks: ScreenshotOcrBlock[], @@ -345,10 +396,31 @@ function findNearestText( return nearest; } +function unionRects(rects: Rect[]): Rect { + let minX = Number.POSITIVE_INFINITY; + let minY = Number.POSITIVE_INFINITY; + let maxX = Number.NEGATIVE_INFINITY; + let maxY = Number.NEGATIVE_INFINITY; + for (const rect of rects) { + minX = Math.min(minX, rect.x); + minY = Math.min(minY, rect.y); + maxX = Math.max(maxX, rect.x + rect.width); + maxY = Math.max(maxY, rect.y + rect.height); + } + return { x: minX, y: minY, width: maxX - minX, height: maxY - minY }; +} + function getOcrBlocks(ocr: ScreenshotOcrAnalysis | undefined): ScreenshotOcrBlock[] { return ocr ? 
[...ocr.baselineBlocksRaw, ...ocr.currentBlocksRaw] : []; } +function cleanOcrAnchorText(text: string): string { + return text + .trim() + .replace(/^[^\p{L}\p{N}]+/u, '') + .replace(/^\p{L}\s+/u, ''); +} + function hasUsefulComponentSize(component: MutableComponent): boolean { const rect = componentToRect(component); return ( @@ -410,6 +482,13 @@ function intersectArea(left: Rect, right: Rect): number { return (maxX - minX) * (maxY - minY); } +function verticalOverlap(left: Rect, right: Rect): number { + return Math.max( + 0, + Math.min(left.y + left.height, right.y + right.height) - Math.max(left.y, right.y), + ); +} + function rectCenter(rect: Rect): { x: number; y: number } { return { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 }; } diff --git a/src/utils/screenshot-diff-ocr.ts b/src/utils/screenshot-diff-ocr.ts index 9e9f011d3..b3b163f29 100644 --- a/src/utils/screenshot-diff-ocr.ts +++ b/src/utils/screenshot-diff-ocr.ts @@ -17,11 +17,28 @@ export type ScreenshotOcrTextMatch = { possibleTextMetricMismatch: boolean; }; +export type ScreenshotOcrTextChange = { + text: string; + rect: Rect; + confidence: number; +}; + +export type ScreenshotOcrMovementCluster = { + texts: string[]; + averageDelta: { x: number; y: number }; + xRange: { min: number; max: number }; + yRange: { min: number; max: number }; + confidence: number; +}; + export type ScreenshotOcrSummary = { provider: 'tesseract'; baselineBlocks: number; currentBlocks: number; matches: ScreenshotOcrTextMatch[]; + addedText?: ScreenshotOcrTextChange[]; + removedText?: ScreenshotOcrTextChange[]; + movementClusters?: ScreenshotOcrMovementCluster[]; }; export type ScreenshotOcrAnalysis = ScreenshotOcrSummary & { @@ -38,6 +55,15 @@ type TesseractWord = { const OCR_TIMEOUT_MS = 10_000; const MAX_OCR_MATCHES = 12; +const MAX_OCR_TEXT_CHANGES = 5; +const MIN_OCR_TEXT_PRESENCE_CONFIDENCE = 50; +const MIN_OCR_TEXT_CHANGE_CONFIDENCE = 80; +const MIN_TEXT_CHANGE_CANONICAL_LENGTH = 3; +const 
TOP_CHROME_TEXT_CHANGE_IGNORE_Y_RATIO = 0.08; +const MAX_MOVEMENT_CLUSTERS = 4; +const MIN_CLUSTERED_MATCHES = 2; +const MOVEMENT_CLUSTER_MAX_X_SPREAD_PX = 32; +const MOVEMENT_CLUSTER_MAX_Y_SPREAD_PX = 60; // OCR text matching uses small generic movement/shape thresholds; the fixed gap // is only a floor before falling back to word-height-relative spacing. const MIN_MEANINGFUL_DELTA_PX = 2; @@ -63,6 +89,8 @@ export async function summarizeScreenshotOcr(params: { const baselineBlocks = parseTesseractTsv(baselineResult.stdout, params.width, params.height); const currentBlocks = parseTesseractTsv(currentResult.stdout, params.width, params.height); const matches = matchOcrBlocks(baselineBlocks, currentBlocks); + const textChanges = summarizeOcrTextChanges(baselineBlocks, currentBlocks, params.height); + const movementClusters = summarizeOcrMovementClusters(matches); if (baselineBlocks.length === 0 && currentBlocks.length === 0) return undefined; return { @@ -72,6 +100,9 @@ export async function summarizeScreenshotOcr(params: { baselineBlocksRaw: baselineBlocks, currentBlocksRaw: currentBlocks, matches, + ...(textChanges.addedText.length > 0 ? { addedText: textChanges.addedText } : {}), + ...(textChanges.removedText.length > 0 ? { removedText: textChanges.removedText } : {}), + ...(movementClusters.length > 0 ? { movementClusters } : {}), }; } catch { return undefined; @@ -84,6 +115,9 @@ export function toScreenshotOcrSummary(analysis: ScreenshotOcrAnalysis): Screens baselineBlocks: analysis.baselineBlocks, currentBlocks: analysis.currentBlocks, matches: analysis.matches, + ...(analysis.addedText ? { addedText: analysis.addedText } : {}), + ...(analysis.removedText ? { removedText: analysis.removedText } : {}), + ...(analysis.movementClusters ? 
{ movementClusters: analysis.movementClusters } : {}), }; } @@ -289,6 +323,100 @@ function scoreOcrMatch(match: ScreenshotOcrTextMatch): number { ); } +export function summarizeOcrTextChanges( + baselineBlocks: ScreenshotOcrBlock[], + currentBlocks: ScreenshotOcrBlock[], + imageHeight: number, +): { addedText: ScreenshotOcrTextChange[]; removedText: ScreenshotOcrTextChange[] } { + const baselineCandidates = toTextChangeCandidates(baselineBlocks, imageHeight); + const currentCandidates = toTextChangeCandidates(currentBlocks, imageHeight); + const baselineKeys = new Set(baselineCandidates.map((candidate) => candidate.key)); + const currentKeys = new Set(currentCandidates.map((candidate) => candidate.key)); + return { + addedText: currentCandidates + .filter((candidate) => !baselineKeys.has(candidate.key)) + .filter((candidate) => candidate.confidence >= MIN_OCR_TEXT_CHANGE_CONFIDENCE) + .map(toTextChange) + .slice(0, MAX_OCR_TEXT_CHANGES), + removedText: baselineCandidates + .filter((candidate) => !currentKeys.has(candidate.key)) + .filter((candidate) => candidate.confidence >= MIN_OCR_TEXT_CHANGE_CONFIDENCE) + .map(toTextChange) + .slice(0, MAX_OCR_TEXT_CHANGES), + }; +} + +export function summarizeOcrMovementClusters( + matches: ScreenshotOcrTextMatch[], +): ScreenshotOcrMovementCluster[] { + const clusters: ScreenshotOcrTextMatch[][] = []; + for (const match of [...matches].sort( + (left, right) => left.currentRect.y - right.currentRect.y, + )) { + const cluster = clusters.find( + (candidate) => + Math.abs(match.delta.x - average(candidate.map((item) => item.delta.x))) <= + MOVEMENT_CLUSTER_MAX_X_SPREAD_PX, + ); + if (cluster) cluster.push(match); + else clusters.push([match]); + } + + return clusters + .filter((cluster) => cluster.length >= MIN_CLUSTERED_MATCHES) + .map(toMovementCluster) + .filter( + (cluster) => cluster.yRange.max - cluster.yRange.min <= MOVEMENT_CLUSTER_MAX_Y_SPREAD_PX, + ) + .sort((left, right) => scoreMovementCluster(right) - 
scoreMovementCluster(left)) + .slice(0, MAX_MOVEMENT_CLUSTERS); +} + +function toMovementCluster(matches: ScreenshotOcrTextMatch[]): ScreenshotOcrMovementCluster { + const xDeltas = matches.map((match) => match.delta.x); + const yDeltas = matches.map((match) => match.delta.y); + return { + texts: matches.map((match) => match.text), + averageDelta: { x: Math.round(average(xDeltas)), y: Math.round(average(yDeltas)) }, + xRange: { min: Math.min(...xDeltas), max: Math.max(...xDeltas) }, + yRange: { min: Math.min(...yDeltas), max: Math.max(...yDeltas) }, + confidence: Math.round(Math.min(...matches.map((match) => match.confidence)) * 100) / 100, + }; +} + +function toTextChangeCandidates( + blocks: ScreenshotOcrBlock[], + imageHeight: number, +): Array { + return blocks + .filter( + (block) => + block.confidence >= MIN_OCR_TEXT_PRESENCE_CONFIDENCE && + block.rect.y >= imageHeight * TOP_CHROME_TEXT_CHANGE_IGNORE_Y_RATIO, + ) + .map((block) => ({ ...block, key: canonicalTextChangeKey(block.text) })) + .filter((block) => block.key.length >= MIN_TEXT_CHANGE_CANONICAL_LENGTH) + .sort((left, right) => left.rect.y - right.rect.y || left.rect.x - right.rect.x); +} + +function toTextChange( + candidate: ScreenshotOcrTextChange & { key: string }, +): ScreenshotOcrTextChange { + return { text: candidate.text, rect: candidate.rect, confidence: candidate.confidence }; +} + +function scoreMovementCluster(cluster: ScreenshotOcrMovementCluster): number { + return Math.abs(cluster.averageDelta.x) * 2 + Math.abs(cluster.averageDelta.y); +} + +function canonicalTextChangeKey(text: string): string { + const tokens = text + .toLowerCase() + .match(/[\p{L}\p{N}]+/gu) + ?.filter((token) => /[\p{L}]/u.test(token) && token.length > 1); + return tokens?.join(' ') ?? 
''; +} + function unionRects(rects: Rect[]): Rect { let minX = Number.POSITIVE_INFINITY; let minY = Number.POSITIVE_INFINITY; diff --git a/src/utils/screenshot-diff.ts b/src/utils/screenshot-diff.ts index 2ca7b6ee4..c02d471a0 100644 --- a/src/utils/screenshot-diff.ts +++ b/src/utils/screenshot-diff.ts @@ -149,7 +149,9 @@ export async function compareScreenshots( }) : undefined; const ocr = - ocrAnalysis && ocrAnalysis.matches.length > 0 ? toScreenshotOcrSummary(ocrAnalysis) : undefined; + ocrAnalysis && hasScreenshotOcrSummary(ocrAnalysis) + ? toScreenshotOcrSummary(ocrAnalysis) + : undefined; const nonTextDeltas = differentPixels > 0 && ocrAnalysis ? summarizeNonTextDiffDeltas({ @@ -178,6 +180,17 @@ export async function compareScreenshots( }; } +function hasScreenshotOcrSummary( + ocr: NonNullable>>, +): boolean { + return ( + ocr.matches.length > 0 || + (ocr.addedText?.length ?? 0) > 0 || + (ocr.removedText?.length ?? 0) > 0 || + (ocr.movementClusters?.length ?? 0) > 0 + ); +} + async function validateFileExists(filePath: string, errorMessage: string): Promise { try { await fs.access(filePath); diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index f1fc44efc..5d01fa6df 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -554,7 +554,7 @@ agent-device record stop # Stop active recording - Recordings always produce a video artifact. When touch visualization is enabled, they also produce a gesture telemetry sidecar that can be used for post-processing or inspection. - `screenshot --overlay-refs` captures a fresh full snapshot and burns visible `@eN` refs plus their target rectangles into the saved PNG. 
- `diff screenshot` compares the current screenshot to `--baseline`, prints ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance, and writes a diff PNG with a light grayscale current-screen context, red-tinted changed pixels, and outlined changed regions when `--out` is provided. JSON also includes normalized bounds. -- If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. +- If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas, movement clusters, added/removed text candidates, and bbox size-change hints to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the diff and clustering remaining residuals. These are hints for icons, controls, and separators, not semantic icon recognition. - `diff screenshot --overlay-refs` additionally writes a separate current-screen overlay guide without using that annotated image for the pixel comparison. If current-screen refs intersect changed regions, the output lists the best ref matches under those regions. - In `--json` mode, each overlay ref also includes a screenshot-space `center` point for coordinate fallback like `press `. 
From 4ca21fcc0154d2ed6ef571780f65d408764d00bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 12 Apr 2026 12:29:16 +0200 Subject: [PATCH 8/8] refactor: prune screenshot diff guidance output --- .../agent-device/references/verification.md | 2 +- src/utils/__tests__/output.test.ts | 32 +++---- .../screenshot-diff-non-text.test.ts | 38 +++++++- .../__tests__/screenshot-diff-ocr.test.ts | 62 ------------- src/utils/output.ts | 34 ++----- src/utils/screenshot-diff-non-text.ts | 21 ++--- src/utils/screenshot-diff-ocr.ts | 88 +------------------ src/utils/screenshot-diff.ts | 33 +++---- website/docs/docs/commands.md | 2 +- 9 files changed, 81 insertions(+), 231 deletions(-) diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index a2922f9e6..860120a4b 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -59,7 +59,7 @@ agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --ove - Text output includes ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance. JSON also includes normalized bounds. - The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. -- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas, movement clusters, added/removed text candidates, and bbox size-change hints. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. +- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas, movement clusters, and bbox size-change hints. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. 
- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition. - Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. diff --git a/src/utils/__tests__/output.test.ts b/src/utils/__tests__/output.test.ts index f2e44b7e7..b5372d97c 100644 --- a/src/utils/__tests__/output.test.ts +++ b/src/utils/__tests__/output.test.ts @@ -706,27 +706,11 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' possibleTextMetricMismatch: true, }, ], - addedText: [ - { - text: 'Accessibility', - rect: { x: 220, y: 2520, width: 220, height: 40 }, - confidence: 93, - }, - ], - removedText: [ - { - text: 'VPN', - rect: { x: 120, y: 2100, width: 90, height: 36 }, - confidence: 96, - }, - ], movementClusters: [ { texts: ['Wi-Fi', 'Bluetooth'], - averageDelta: { x: 10, y: 12 }, xRange: { min: 10, max: 12 }, yRange: { min: 10, max: 14 }, - confidence: 90, }, ], }, @@ -739,6 +723,13 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' rect: { x: 80, y: 318, width: 30, height: 30 }, nearestText: 'Wi-Fi', }, + { + index: 2, + regionIndex: 1, + slot: 'separator', + likelyKind: 'separator', + rect: { x: 90, y: 360, width: 120, height: 2 }, + }, ], }), ); @@ -752,10 +743,8 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' text, /text movement cluster: "Wi-Fi", "Bluetooth" dx=\+10\.\.\+12px dy=\+10\.\.\+14px/, ); - assert.match(text, /added text candidates: "Accessibility" at x=220,y=2520/); - 
assert.match(text, /removed text candidates: "VPN" at x=120,y=2100/); - assert.match(text, /non-text controls\/boundaries: icon near "Wi-Fi" r1/); - assert.match(text, /largest changed region: r1 top-left 70% of diff, brighter/); + assert.match(text, /non-text controls: icon near "Wi-Fi" r1/); + assert.match(text, /non-text boundaries: separator r1/); assert.match(text, /Changed regions:/); assert.match(text, /1\. top-left x=10 y=20 100x40, 70% of diff, change=brighter/); assert.match( @@ -775,9 +764,10 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' text, /1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| 94 \| ocr-bbox-size-change/, ); - assert.match(text, /Non-text visual deltas \(showing 1\/1; px\):/); + assert.match(text, /Non-text visual deltas \(showing 2\/2; px\):/); assert.match(text, /item \| region \| slot \| kind \| bboxCurrent \| nearestText/); assert.match(text, /1 \| r1 \| leading \| icon \| x=80,y=318,w=30,h=30 \| "Wi-Fi"/); + assert.match(text, /2 \| r1 \| separator \| separator \| x=90,y=360,w=120,h=2 \| -/); assert.equal(text.includes('\x1b['), false); }); diff --git a/src/utils/__tests__/screenshot-diff-non-text.test.ts b/src/utils/__tests__/screenshot-diff-non-text.test.ts index b644922f1..e32fce5a9 100644 --- a/src/utils/__tests__/screenshot-diff-non-text.test.ts +++ b/src/utils/__tests__/screenshot-diff-non-text.test.ts @@ -47,7 +47,8 @@ test('summarizeNonTextDiffDeltas masks OCR text and reports leading icon residua provider: 'tesseract', baselineBlocks: 1, currentBlocks: 1, - baselineBlocksRaw: [ + baselineBlocksRaw: [], + currentBlocksRaw: [ { text: 'Wi-Fi', confidence: 90, @@ -55,7 +56,6 @@ test('summarizeNonTextDiffDeltas masks OCR text and reports leading icon residua normalizedRect: { x: 30.91, y: 23.33, width: 27.27, height: 20 }, }, ], - currentBlocksRaw: [], matches: [], }, }); @@ -68,6 +68,40 @@ test('summarizeNonTextDiffDeltas masks OCR text and reports 
leading icon residua assert.equal(deltas[0]?.nearestText, 'Wi-Fi'); }); +test('summarizeNonTextDiffDeltas uses overlapping baseline text when current OCR misses a row', () => { + const width = 220; + const height = 120; + const diffMask = new Uint8Array(width * height); + paintMaskRect(diffMask, width, { x: 20, y: 30, width: 20, height: 20 }); + + const deltas = summarizeNonTextDiffDeltas({ + diffMask, + width, + height, + regions: [], + ocr: { + provider: 'tesseract', + baselineBlocks: 1, + currentBlocks: 0, + baselineBlocksRaw: [ + { + text: 'Wi-Fi', + confidence: 90, + rect: { x: 68, y: 28, width: 60, height: 24 }, + normalizedRect: { x: 30.91, y: 23.33, width: 27.27, height: 20 }, + }, + ], + currentBlocksRaw: [], + matches: [], + }, + }); + + assert.equal(deltas.length, 1); + assert.equal(deltas[0]?.slot, 'leading'); + assert.equal(deltas[0]?.likelyKind, 'icon'); + assert.equal(deltas[0]?.nearestText, 'Wi-Fi'); +}); + test('summarizeNonTextDiffDeltas omits broad background residuals', () => { const width = 220; const height = 120; diff --git a/src/utils/__tests__/screenshot-diff-ocr.test.ts b/src/utils/__tests__/screenshot-diff-ocr.test.ts index 139d502aa..6f6204dea 100644 --- a/src/utils/__tests__/screenshot-diff-ocr.test.ts +++ b/src/utils/__tests__/screenshot-diff-ocr.test.ts @@ -8,7 +8,6 @@ import { parseTesseractTsv, summarizeScreenshotOcr, summarizeOcrMovementClusters, - summarizeOcrTextChanges, } from '../screenshot-diff-ocr.ts'; test('parseTesseractTsv groups word rows into text line blocks', () => { @@ -71,67 +70,6 @@ test('matchOcrBlocks reports movement and OCR bbox size change', () => { assert.equal(matches[0]?.possibleTextMetricMismatch, true); }); -test('summarizeOcrTextChanges normalizes noisy OCR labels before reporting candidates', () => { - const changes = summarizeOcrTextChanges( - [ - { - text: 'Airplane Mode', - confidence: 95, - rect: { x: 100, y: 200, width: 80, height: 20 }, - normalizedRect: { x: 25, y: 25, width: 20, height: 2.5 }, - 
}, - { - text: '2) Personal Hotspot', - confidence: 90, - rect: { x: 100, y: 260, width: 160, height: 20 }, - normalizedRect: { x: 25, y: 32.5, width: 40, height: 2.5 }, - }, - { - text: 'Removed Row', - confidence: 91, - rect: { x: 100, y: 320, width: 140, height: 20 }, - normalizedRect: { x: 25, y: 40, width: 35, height: 2.5 }, - }, - { - text: '4:44', - confidence: 96, - rect: { x: 10, y: 20, width: 60, height: 20 }, - normalizedRect: { x: 2.5, y: 2.5, width: 15, height: 2.5 }, - }, - ], - [ - { - text: 'Airplane Mode @e~', - confidence: 91, - rect: { x: 120, y: 210, width: 120, height: 20 }, - normalizedRect: { x: 30, y: 26.25, width: 30, height: 2.5 }, - }, - { - text: 'Personal Hotspot', - confidence: 92, - rect: { x: 120, y: 270, width: 140, height: 20 }, - normalizedRect: { x: 30, y: 33.75, width: 35, height: 2.5 }, - }, - { - text: 'Added Row', - confidence: 93, - rect: { x: 120, y: 340, width: 110, height: 20 }, - normalizedRect: { x: 30, y: 42.5, width: 27.5, height: 2.5 }, - }, - ], - 800, - ); - - assert.deepEqual( - changes.addedText.map((change) => change.text), - ['Added Row'], - ); - assert.deepEqual( - changes.removedText.map((change) => change.text), - ['Removed Row'], - ); -}); - test('summarizeOcrMovementClusters groups repeated x-axis text movement', () => { const clusters = summarizeOcrMovementClusters([ { diff --git a/src/utils/output.ts b/src/utils/output.ts index 547d239ba..56fd7f6a5 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -336,43 +336,23 @@ function formatScreenshotDiffHints(data: ScreenshotDiffResult): string[] { ); } - const addedText = data.ocr?.addedText ?? []; - if (addedText.length > 0) { - hints.push(`added text candidates: ${formatTextChanges(addedText)}`); - } - - const removedText = data.ocr?.removedText ?? []; - if (removedText.length > 0) { - hints.push(`removed text candidates: ${formatTextChanges(removedText)}`); - } - const controlDeltas = (data.nonTextDeltas ?? 
[]) - .filter((delta) => ['icon', 'toggle', 'chevron', 'separator'].includes(delta.likelyKind)) + .filter((delta) => ['icon', 'toggle', 'chevron'].includes(delta.likelyKind)) .slice(0, 3); if (controlDeltas.length > 0) { - hints.push(`non-text controls/boundaries: ${controlDeltas.map(formatNonTextHint).join('; ')}`); + hints.push(`non-text controls: ${controlDeltas.map(formatNonTextHint).join('; ')}`); } - const largestRegion = data.regions?.[0]; - if (largestRegion) { - hints.push( - `largest changed region: r${largestRegion.index} ${largestRegion.location} ` + - `${largestRegion.shareOfDiffPercentage}% of diff, ${largestRegion.dominantChange}`, - ); + const boundaryDeltas = (data.nonTextDeltas ?? []) + .filter((delta) => delta.likelyKind === 'separator') + .slice(0, 2); + if (boundaryDeltas.length > 0) { + hints.push(`non-text boundaries: ${boundaryDeltas.map(formatNonTextHint).join('; ')}`); } return hints.slice(0, 6); } -function formatTextChanges( - changes: Array<{ text: string; rect: { x: number; y: number; width: number; height: number } }>, -): string { - return changes - .slice(0, 3) - .map((change) => `${JSON.stringify(change.text)} at x=${change.rect.x},y=${change.rect.y}`) - .join(', '); -} - function formatNonTextHint(delta: { likelyKind: string; nearestText?: string; diff --git a/src/utils/screenshot-diff-non-text.ts b/src/utils/screenshot-diff-non-text.ts index 6a52a21b0..7131a6537 100644 --- a/src/utils/screenshot-diff-non-text.ts +++ b/src/utils/screenshot-diff-non-text.ts @@ -84,11 +84,11 @@ export function summarizeNonTextDiffDeltas(params: { const rawComponents = findConnectedComponents(maskedDiff, params.width, params.height); const mergedComponents = mergeNearbyComponents(rawComponents, MERGE_GAP_PX); const currentRows = groupOcrRows(params.ocr?.currentBlocksRaw ?? []); - const fallbackTextBlocks = getOcrBlocks(params.ocr); + const baselineRows = groupOcrRows(params.ocr?.baselineBlocksRaw ?? 
[]); return ( mergedComponents .filter(hasUsefulComponentSize) - .map((component) => toNonTextDelta(component, params, currentRows, fallbackTextBlocks)) + .map((component) => toNonTextDelta(component, params, currentRows, baselineRows)) // Status bars and top chrome tend to produce noisy residuals around time, // signal, and battery text; changed regions still report that area. .filter((delta) => delta.rect.y >= params.height * MIN_CONTENT_Y_RATIO) @@ -197,11 +197,11 @@ function toNonTextDelta( regions: ScreenshotDiffRegion[]; }, currentRows: OcrRow[], - fallbackTextBlocks: ScreenshotOcrBlock[], + baselineRows: OcrRow[], ): ScoredNonTextDelta { const rect = componentToRect(component); const regionIndex = findContainingRegionIndex(rect, params.regions); - const textAnchor = findTextAnchor(rect, currentRows, fallbackTextBlocks); + const textAnchor = findTextAnchor(rect, currentRows, baselineRows); const slot = classifySlot(rect, textAnchor?.block.rect, params.width); const likelyKind = classifyLikelyKind(rect, slot, component.differentPixels, params); const scoreParams = { @@ -341,11 +341,12 @@ function findContainingRegionIndex( function findTextAnchor( rect: Rect, currentRows: OcrRow[], - fallbackTextBlocks: ScreenshotOcrBlock[], + baselineRows: OcrRow[], ): { block: ScreenshotOcrBlock; distance: number } | undefined { - const row = findOverlappingRow(rect, currentRows); - if (row) return findNearestText(rect, row.blocks); - return findNearestText(rect, fallbackTextBlocks); + const currentRow = findOverlappingRow(rect, currentRows); + if (currentRow) return findNearestText(rect, currentRow.blocks); + const baselineRow = findOverlappingRow(rect, baselineRows); + return baselineRow ? 
findNearestText(rect, baselineRow.blocks) : undefined; } function findOverlappingRow(rect: Rect, rows: OcrRow[]): OcrRow | undefined { @@ -410,10 +411,6 @@ function unionRects(rects: Rect[]): Rect { return { x: minX, y: minY, width: maxX - minX, height: maxY - minY }; } -function getOcrBlocks(ocr: ScreenshotOcrAnalysis | undefined): ScreenshotOcrBlock[] { - return ocr ? [...ocr.baselineBlocksRaw, ...ocr.currentBlocksRaw] : []; -} - function cleanOcrAnchorText(text: string): string { return text .trim() diff --git a/src/utils/screenshot-diff-ocr.ts b/src/utils/screenshot-diff-ocr.ts index b3b163f29..f0dffea0f 100644 --- a/src/utils/screenshot-diff-ocr.ts +++ b/src/utils/screenshot-diff-ocr.ts @@ -17,18 +17,10 @@ export type ScreenshotOcrTextMatch = { possibleTextMetricMismatch: boolean; }; -export type ScreenshotOcrTextChange = { - text: string; - rect: Rect; - confidence: number; -}; - export type ScreenshotOcrMovementCluster = { texts: string[]; - averageDelta: { x: number; y: number }; xRange: { min: number; max: number }; yRange: { min: number; max: number }; - confidence: number; }; export type ScreenshotOcrSummary = { @@ -36,8 +28,6 @@ export type ScreenshotOcrSummary = { baselineBlocks: number; currentBlocks: number; matches: ScreenshotOcrTextMatch[]; - addedText?: ScreenshotOcrTextChange[]; - removedText?: ScreenshotOcrTextChange[]; movementClusters?: ScreenshotOcrMovementCluster[]; }; @@ -55,11 +45,6 @@ type TesseractWord = { const OCR_TIMEOUT_MS = 10_000; const MAX_OCR_MATCHES = 12; -const MAX_OCR_TEXT_CHANGES = 5; -const MIN_OCR_TEXT_PRESENCE_CONFIDENCE = 50; -const MIN_OCR_TEXT_CHANGE_CONFIDENCE = 80; -const MIN_TEXT_CHANGE_CANONICAL_LENGTH = 3; -const TOP_CHROME_TEXT_CHANGE_IGNORE_Y_RATIO = 0.08; const MAX_MOVEMENT_CLUSTERS = 4; const MIN_CLUSTERED_MATCHES = 2; const MOVEMENT_CLUSTER_MAX_X_SPREAD_PX = 32; @@ -89,7 +74,6 @@ export async function summarizeScreenshotOcr(params: { const baselineBlocks = parseTesseractTsv(baselineResult.stdout, params.width, 
params.height); const currentBlocks = parseTesseractTsv(currentResult.stdout, params.width, params.height); const matches = matchOcrBlocks(baselineBlocks, currentBlocks); - const textChanges = summarizeOcrTextChanges(baselineBlocks, currentBlocks, params.height); const movementClusters = summarizeOcrMovementClusters(matches); if (baselineBlocks.length === 0 && currentBlocks.length === 0) return undefined; @@ -100,8 +84,6 @@ export async function summarizeScreenshotOcr(params: { baselineBlocksRaw: baselineBlocks, currentBlocksRaw: currentBlocks, matches, - ...(textChanges.addedText.length > 0 ? { addedText: textChanges.addedText } : {}), - ...(textChanges.removedText.length > 0 ? { removedText: textChanges.removedText } : {}), ...(movementClusters.length > 0 ? { movementClusters } : {}), }; } catch { @@ -109,18 +91,6 @@ export async function summarizeScreenshotOcr(params: { } } -export function toScreenshotOcrSummary(analysis: ScreenshotOcrAnalysis): ScreenshotOcrSummary { - return { - provider: analysis.provider, - baselineBlocks: analysis.baselineBlocks, - currentBlocks: analysis.currentBlocks, - matches: analysis.matches, - ...(analysis.addedText ? { addedText: analysis.addedText } : {}), - ...(analysis.removedText ? { removedText: analysis.removedText } : {}), - ...(analysis.movementClusters ? 
{ movementClusters: analysis.movementClusters } : {}), - }; -} - export function parseTesseractTsv( tsv: string, imageWidth: number, @@ -323,29 +293,6 @@ function scoreOcrMatch(match: ScreenshotOcrTextMatch): number { ); } -export function summarizeOcrTextChanges( - baselineBlocks: ScreenshotOcrBlock[], - currentBlocks: ScreenshotOcrBlock[], - imageHeight: number, -): { addedText: ScreenshotOcrTextChange[]; removedText: ScreenshotOcrTextChange[] } { - const baselineCandidates = toTextChangeCandidates(baselineBlocks, imageHeight); - const currentCandidates = toTextChangeCandidates(currentBlocks, imageHeight); - const baselineKeys = new Set(baselineCandidates.map((candidate) => candidate.key)); - const currentKeys = new Set(currentCandidates.map((candidate) => candidate.key)); - return { - addedText: currentCandidates - .filter((candidate) => !baselineKeys.has(candidate.key)) - .filter((candidate) => candidate.confidence >= MIN_OCR_TEXT_CHANGE_CONFIDENCE) - .map(toTextChange) - .slice(0, MAX_OCR_TEXT_CHANGES), - removedText: baselineCandidates - .filter((candidate) => !currentKeys.has(candidate.key)) - .filter((candidate) => candidate.confidence >= MIN_OCR_TEXT_CHANGE_CONFIDENCE) - .map(toTextChange) - .slice(0, MAX_OCR_TEXT_CHANGES), - }; -} - export function summarizeOcrMovementClusters( matches: ScreenshotOcrTextMatch[], ): ScreenshotOcrMovementCluster[] { @@ -377,44 +324,15 @@ function toMovementCluster(matches: ScreenshotOcrTextMatch[]): ScreenshotOcrMove const yDeltas = matches.map((match) => match.delta.y); return { texts: matches.map((match) => match.text), - averageDelta: { x: Math.round(average(xDeltas)), y: Math.round(average(yDeltas)) }, xRange: { min: Math.min(...xDeltas), max: Math.max(...xDeltas) }, yRange: { min: Math.min(...yDeltas), max: Math.max(...yDeltas) }, - confidence: Math.round(Math.min(...matches.map((match) => match.confidence)) * 100) / 100, }; } -function toTextChangeCandidates( - blocks: ScreenshotOcrBlock[], - imageHeight: number, -): 
Array { - return blocks - .filter( - (block) => - block.confidence >= MIN_OCR_TEXT_PRESENCE_CONFIDENCE && - block.rect.y >= imageHeight * TOP_CHROME_TEXT_CHANGE_IGNORE_Y_RATIO, - ) - .map((block) => ({ ...block, key: canonicalTextChangeKey(block.text) })) - .filter((block) => block.key.length >= MIN_TEXT_CHANGE_CANONICAL_LENGTH) - .sort((left, right) => left.rect.y - right.rect.y || left.rect.x - right.rect.x); -} - -function toTextChange( - candidate: ScreenshotOcrTextChange & { key: string }, -): ScreenshotOcrTextChange { - return { text: candidate.text, rect: candidate.rect, confidence: candidate.confidence }; -} - function scoreMovementCluster(cluster: ScreenshotOcrMovementCluster): number { - return Math.abs(cluster.averageDelta.x) * 2 + Math.abs(cluster.averageDelta.y); -} - -function canonicalTextChangeKey(text: string): string { - const tokens = text - .toLowerCase() - .match(/[\p{L}\p{N}]+/gu) - ?.filter((token) => /[\p{L}]/u.test(token) && token.length > 1); - return tokens?.join(' ') ?? 
''; + const averageX = (cluster.xRange.min + cluster.xRange.max) / 2; + const averageY = (cluster.yRange.min + cluster.yRange.max) / 2; + return Math.abs(averageX) * 2 + Math.abs(averageY); } function unionRects(rects: Rect[]): Rect { diff --git a/src/utils/screenshot-diff.ts b/src/utils/screenshot-diff.ts index c02d471a0..b15fca23c 100644 --- a/src/utils/screenshot-diff.ts +++ b/src/utils/screenshot-diff.ts @@ -8,11 +8,7 @@ import { summarizeNonTextDiffDeltas, type ScreenshotNonTextDelta, } from './screenshot-diff-non-text.ts'; -import { - summarizeScreenshotOcr, - toScreenshotOcrSummary, - type ScreenshotOcrSummary, -} from './screenshot-diff-ocr.ts'; +import { summarizeScreenshotOcr, type ScreenshotOcrSummary } from './screenshot-diff-ocr.ts'; import { summarizeDiffRegions, type ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; export type ScreenshotDimensionMismatch = { @@ -148,10 +144,18 @@ export async function compareScreenshots( height: baseline.height, }) : undefined; - const ocr = - ocrAnalysis && hasScreenshotOcrSummary(ocrAnalysis) - ? toScreenshotOcrSummary(ocrAnalysis) - : undefined; + const shouldIncludeOcr = + ocrAnalysis && + (ocrAnalysis.matches.length > 0 || (ocrAnalysis.movementClusters?.length ?? 0) > 0); + const ocr = shouldIncludeOcr + ? { + provider: ocrAnalysis.provider, + baselineBlocks: ocrAnalysis.baselineBlocks, + currentBlocks: ocrAnalysis.currentBlocks, + matches: ocrAnalysis.matches, + ...(ocrAnalysis.movementClusters ? { movementClusters: ocrAnalysis.movementClusters } : {}), + } + : undefined; const nonTextDeltas = differentPixels > 0 && ocrAnalysis ? summarizeNonTextDiffDeltas({ @@ -180,17 +184,6 @@ export async function compareScreenshots( }; } -function hasScreenshotOcrSummary( - ocr: NonNullable>>, -): boolean { - return ( - ocr.matches.length > 0 || - (ocr.addedText?.length ?? 0) > 0 || - (ocr.removedText?.length ?? 0) > 0 || - (ocr.movementClusters?.length ?? 
0) > 0 - ); -} - async function validateFileExists(filePath: string, errorMessage: string): Promise { try { await fs.access(filePath); diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index 5d01fa6df..7ca28bb3a 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -554,7 +554,7 @@ agent-device record stop # Stop active recording - Recordings always produce a video artifact. When touch visualization is enabled, they also produce a gesture telemetry sidecar that can be used for post-processing or inspection. - `screenshot --overlay-refs` captures a fresh full snapshot and burns visible `@eN` refs plus their target rectangles into the saved PNG. - `diff screenshot` compares the current screenshot to `--baseline`, prints ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance, and writes a diff PNG with a light grayscale current-screen context, red-tinted changed pixels, and outlined changed regions when `--out` is provided. JSON also includes normalized bounds. -- If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas, movement clusters, added/removed text candidates, and bbox size-change hints to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. +- If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas, movement clusters, and bbox size-change hints to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the diff and clustering remaining residuals. These are hints for icons, controls, and separators, not semantic icon recognition. 
- `diff screenshot --overlay-refs` additionally writes a separate current-screen overlay guide without using that annotated image for the pixel comparison. If current-screen refs intersect changed regions, the output lists the best ref matches under those regions. - In `--json` mode, each overlay ref also includes a screenshot-space `center` point for coordinate fallback like `press `.