diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index 8a38ad4d2..860120a4b 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -8,6 +8,7 @@ Open this file when the task needs evidence, regression checks, replay maintenan - `screenshot` - `diff snapshot` +- `diff screenshot` - `record` - `replay -u` - `perf` @@ -41,12 +42,27 @@ agent-device diff snapshot -i - Run `diff snapshot` to confirm the expected structural change. - Re-run full `snapshot` only when you need fresh refs. -## Visual artifacts +## Screenshot artifacts Use `screenshot` when the proof needs a rendered image instead of a structural tree. - Add `--overlay-refs` when you want the saved PNG to show fresh `@eN` refs burned into the screenshot. +## Visual regression with diff screenshot + +Use `diff screenshot` when comparing the current rendered screen against a saved visual baseline. + +```bash +agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png +agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --overlay-refs +``` + +- Text output includes ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance. JSON also includes normalized bounds. +- The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. +- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas, movement clusters, and bbox size-change hints. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. +- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition. 
+- Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. + ## Session recording Use `record` for debugging, documentation, or shareable verification artifacts. diff --git a/src/__tests__/cli-diff.test.ts b/src/__tests__/cli-diff.test.ts index 41e824ff7..5dc33ea94 100644 --- a/src/__tests__/cli-diff.test.ts +++ b/src/__tests__/cli-diff.test.ts @@ -92,7 +92,25 @@ async function runCliCapture( fs.mkdirSync(path.dirname(outPath), { recursive: true }); fs.writeFileSync(outPath, solidPngBuffer(10, 10, { r: 255, g: 255, b: 255 })); } - return { ok: true, data: { path: outPath } }; + return { + ok: true, + data: { + path: outPath, + ...(req.flags?.overlayRefs + ? { + overlayRefs: [ + { + ref: 'e1', + label: 'Continue', + rect: { x: 1, y: 2, width: 3, height: 4 }, + overlayRect: { x: 1, y: 2, width: 3, height: 4 }, + center: { x: 3, y: 4 }, + }, + ], + } + : {}), + }, + }; } return { ok: true, @@ -249,11 +267,13 @@ describe('cli diff commands', () => { 'screenshot', '--baseline', baseline, + '--overlay-refs', '--threshold', '0.2', ]); assert.equal(result.code, null); // The client-backed command captures a screenshot via the daemon client + // and skips a second overlay capture when there is no diff to map. 
assert.equal(result.calls.length, 1); const call = result.calls[0]!; assert.equal(call.command, 'screenshot'); @@ -287,12 +307,15 @@ describe('cli diff commands', () => { const originalHome = process.env.HOME; const baselineRelative = path.join('fixtures', 'baseline.png'); const diffRelative = path.join('fixtures', 'diff.png'); + const overlayRelative = path.join('fixtures', 'diff.current-overlay.png'); const baseline = path.join(fakeHome, baselineRelative); const diffOut = path.join(fakeHome, diffRelative); + const overlayOut = path.join(fakeHome, overlayRelative); fs.mkdirSync(path.dirname(baseline), { recursive: true }); fs.writeFileSync(baseline, solidPngBuffer(10, 10, { r: 255, g: 255, b: 255 })); fs.writeFileSync(diffOut, 'stale diff'); + fs.writeFileSync(overlayOut, 'stale overlay'); process.env.HOME = fakeHome; try { @@ -304,6 +327,7 @@ describe('cli diff commands', () => { `~/${baselineRelative}`, '--out', `~/${diffRelative}`, + '--overlay-refs', '--json', ], { preserveHome: true }, @@ -315,10 +339,50 @@ describe('cli diff commands', () => { assert.equal(payload.success, true); assert.equal(payload.data.match, true); assert.equal(fs.existsSync(diffOut), false); + assert.equal(fs.existsSync(overlayOut), false); } finally { if (typeof originalHome === 'string') process.env.HOME = originalHome; else delete process.env.HOME; fs.rmSync(fakeHome, { recursive: true, force: true }); } }); + + test('diff screenshot --overlay-refs writes a separate current overlay guide', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-test-')); + const baseline = path.join(dir, 'baseline.png'); + const diffOut = path.join(dir, 'diff.png'); + const overlayOut = path.join(dir, 'diff.current-overlay.png'); + fs.writeFileSync(baseline, solidPngBuffer(10, 10, { r: 0, g: 0, b: 0 })); + + try { + const result = await runCliCapture([ + 'diff', + 'screenshot', + '--baseline', + baseline, + '--out', + diffOut, + '--overlay-refs', + '--threshold', + '0', + ]); + 
assert.equal(result.code, null); + assert.equal(result.calls.length, 2); + assert.equal(result.calls[0]?.command, 'screenshot'); + assert.equal(result.calls[0]?.flags?.overlayRefs, undefined); + assert.equal(result.calls[1]?.command, 'screenshot'); + assert.equal(result.calls[1]?.flags?.overlayRefs, true); + assert.equal(result.calls[1]?.positionals?.[0], overlayOut); + assert.match(result.stdout, /Diff image:/); + assert.match(result.stdout, /Current overlay:/); + assert.match(result.stdout, /diff\.current-overlay\.png \(1 refs\)/); + assert.match( + result.stdout, + /size=large shape=large-area density=100% avgColor=#000000->#ffffff luminance=0->255/, + ); + assert.match(result.stdout, /overlaps @e1 "Continue", 12% of region/); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); }); diff --git a/src/cli/commands/screenshot.ts b/src/cli/commands/screenshot.ts index d7cd01c7f..49966dab5 100644 --- a/src/cli/commands/screenshot.ts +++ b/src/cli/commands/screenshot.ts @@ -4,6 +4,7 @@ import path from 'node:path'; import { formatScreenshotDiffText, formatSnapshotDiffText } from '../../utils/output.ts'; import { AppError } from '../../utils/errors.ts'; import { compareScreenshots, type ScreenshotDiffResult } from '../../utils/screenshot-diff.ts'; +import { attachCurrentOverlayMatches } from '../../utils/screenshot-diff-overlay-matches.ts'; import { resolveUserPath } from '../../utils/path-resolution.ts'; import { buildSelectionOptions, writeCommandOutput } from './shared.ts'; import type { ClientCommandHandler } from './router.ts'; @@ -71,6 +72,26 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl threshold: thresholdNum, outputPath, }); + if (flags.overlayRefs && !result.match && !result.dimensionMismatch) { + const overlayResult = await client.capture.screenshot({ + path: outputPath ? 
deriveCurrentOverlayPath(outputPath) : undefined, + overlayRefs: true, + }); + result = { + ...result, + currentOverlayPath: overlayResult.path, + ...(overlayResult.overlayRefs + ? { currentOverlayRefCount: overlayResult.overlayRefs.length } + : {}), + ...(result.regions && overlayResult.overlayRefs + ? { + regions: attachCurrentOverlayMatches(result.regions, overlayResult.overlayRefs), + } + : {}), + }; + } else if (flags.overlayRefs && outputPath) { + removeStaleCurrentOverlay(outputPath); + } } finally { try { fs.unlinkSync(currentPath); @@ -83,3 +104,21 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl writeCommandOutput(flags, result, () => formatScreenshotDiffText(result)); return true; }; + +function deriveCurrentOverlayPath(outputPath: string): string { + const extension = path.extname(outputPath); + const base = extension ? outputPath.slice(0, -extension.length) : outputPath; + return `${base}.current-overlay${extension || '.png'}`; +} + +function removeStaleCurrentOverlay(outputPath: string): void { + try { + fs.unlinkSync(deriveCurrentOverlayPath(outputPath)); + } catch (error) { + if (!isFsError(error, 'ENOENT')) throw error; + } +} + +function isFsError(error: unknown, code: string): error is NodeJS.ErrnoException { + return typeof error === 'object' && error !== null && 'code' in error && error.code === code; +} diff --git a/src/utils/__tests__/output.test.ts b/src/utils/__tests__/output.test.ts index 715e802d3..b5372d97c 100644 --- a/src/utils/__tests__/output.test.ts +++ b/src/utils/__tests__/output.test.ts @@ -664,11 +664,110 @@ test('formatScreenshotDiffText renders mismatch with pixel counts without color' totalPixels: 10000, mismatchPercentage: 5, diffPath: '/tmp/test/diff.png', + currentOverlayPath: '/tmp/test/diff.current-overlay.png', + currentOverlayRefCount: 1, + regions: [ + { + index: 1, + rect: { x: 10, y: 20, width: 100, height: 40 }, + normalizedRect: { x: 10, y: 20, width: 100, height: 40 }, + 
differentPixels: 350, + shareOfDiffPercentage: 70, + densityPercentage: 8.75, + shape: 'horizontal-band', + size: 'medium', + location: 'top-left', + averageBaselineColorHex: '#141414', + averageCurrentColorHex: '#dcdcdc', + baselineLuminance: 20, + currentLuminance: 220, + dominantChange: 'brighter', + currentOverlayMatches: [ + { + ref: 'e1', + label: 'Continue', + rect: { x: 1, y: 2, width: 3, height: 4 }, + regionCoveragePercentage: 12, + }, + ], + }, + ], + ocr: { + provider: 'tesseract', + baselineBlocks: 2, + currentBlocks: 2, + matches: [ + { + text: 'Wi-Fi', + baselineRect: { x: 120, y: 320, width: 60, height: 22 }, + currentRect: { x: 130, y: 332, width: 70, height: 22 }, + delta: { x: 10, y: 12, width: 10, height: 0 }, + confidence: 94, + possibleTextMetricMismatch: true, + }, + ], + movementClusters: [ + { + texts: ['Wi-Fi', 'Bluetooth'], + xRange: { min: 10, max: 12 }, + yRange: { min: 10, max: 14 }, + }, + ], + }, + nonTextDeltas: [ + { + index: 1, + regionIndex: 1, + slot: 'leading', + likelyKind: 'icon', + rect: { x: 80, y: 318, width: 30, height: 30 }, + nearestText: 'Wi-Fi', + }, + { + index: 2, + regionIndex: 1, + slot: 'separator', + likelyKind: 'separator', + rect: { x: 90, y: 360, width: 120, height: 2 }, + }, + ], }), ); assert.match(text, /✗ 5% pixels differ/); assert.match(text, /Diff image:/); + assert.match(text, /Current overlay:/); + assert.match(text, /diff\.current-overlay\.png \(1 refs\)/); assert.match(text, /500 different \/ 10000 total pixels/); + assert.match(text, /Hints:/); + assert.match( + text, + /text movement cluster: "Wi-Fi", "Bluetooth" dx=\+10\.\.\+12px dy=\+10\.\.\+14px/, + ); + assert.match(text, /non-text controls: icon near "Wi-Fi" r1/); + assert.match(text, /non-text boundaries: separator r1/); + assert.match(text, /Changed regions:/); + assert.match(text, /1\. 
top-left x=10 y=20 100x40, 70% of diff, change=brighter/); + assert.match( + text, + /size=medium shape=horizontal-band density=8\.75% avgColor=#141414->#dcdcdc luminance=20->220/, + ); + assert.match(text, /overlaps @e1 "Continue", 12% of region/); + assert.match( + text, + /OCR text deltas \(tesseract; baselineBlocks=2 currentBlocks=2; showing 1\/1; px\):/, + ); + assert.match( + text, + /item \| text \| movePx \| sizeDeltaPx \| bboxBaseline \| bboxCurrent \| confidence \| issueHint/, + ); + assert.match( + text, + /1 \| "Wi-Fi" \| \+10,\+12 \| \+10,0 \| x=120,y=320,w=60,h=22 \| x=130,y=332,w=70,h=22 \| 94 \| ocr-bbox-size-change/, + ); + assert.match(text, /Non-text visual deltas \(showing 2\/2; px\):/); + assert.match(text, /item \| region \| slot \| kind \| bboxCurrent \| nearestText/); + assert.match(text, /1 \| r1 \| leading \| icon \| x=80,y=318,w=30,h=30 \| "Wi-Fi"/); + assert.match(text, /2 \| r1 \| separator \| separator \| x=90,y=360,w=120,h=2 \| -/); assert.equal(text.includes('\x1b['), false); }); diff --git a/src/utils/__tests__/screenshot-diff-non-text.test.ts b/src/utils/__tests__/screenshot-diff-non-text.test.ts new file mode 100644 index 000000000..e32fce5a9 --- /dev/null +++ b/src/utils/__tests__/screenshot-diff-non-text.test.ts @@ -0,0 +1,136 @@ +import assert from 'node:assert/strict'; +import { test } from 'vitest'; +import { summarizeNonTextDiffDeltas } from '../screenshot-diff-non-text.ts'; + +function paintMaskRect( + mask: Uint8Array, + imageWidth: number, + rect: { x: number; y: number; width: number; height: number }, +): void { + for (let y = rect.y; y < rect.y + rect.height; y += 1) { + for (let x = rect.x; x < rect.x + rect.width; x += 1) { + mask[y * imageWidth + x] = 1; + } + } +} + +test('summarizeNonTextDiffDeltas masks OCR text and reports leading icon residuals', () => { + const width = 220; + const height = 120; + const diffMask = new Uint8Array(width * height); + paintMaskRect(diffMask, width, { x: 20, y: 30, width: 20, 
height: 20 }); + paintMaskRect(diffMask, width, { x: 70, y: 32, width: 48, height: 12 }); + + const deltas = summarizeNonTextDiffDeltas({ + diffMask, + width, + height, + regions: [ + { + index: 1, + rect: { x: 0, y: 20, width: 180, height: 50 }, + normalizedRect: { x: 0, y: 16.67, width: 81.82, height: 41.67 }, + differentPixels: 976, + shareOfDiffPercentage: 100, + densityPercentage: 10.84, + shape: 'horizontal-band', + size: 'medium', + location: 'center', + averageBaselineColorHex: '#000000', + averageCurrentColorHex: '#ffffff', + baselineLuminance: 0, + currentLuminance: 255, + dominantChange: 'brighter', + }, + ], + ocr: { + provider: 'tesseract', + baselineBlocks: 1, + currentBlocks: 1, + baselineBlocksRaw: [], + currentBlocksRaw: [ + { + text: 'Wi-Fi', + confidence: 90, + rect: { x: 68, y: 28, width: 60, height: 24 }, + normalizedRect: { x: 30.91, y: 23.33, width: 27.27, height: 20 }, + }, + ], + matches: [], + }, + }); + + assert.equal(deltas.length, 1); + assert.equal(deltas[0]?.regionIndex, 1); + assert.equal(deltas[0]?.slot, 'leading'); + assert.equal(deltas[0]?.likelyKind, 'icon'); + assert.deepEqual(deltas[0]?.rect, { x: 20, y: 30, width: 20, height: 20 }); + assert.equal(deltas[0]?.nearestText, 'Wi-Fi'); +}); + +test('summarizeNonTextDiffDeltas uses overlapping baseline text when current OCR misses a row', () => { + const width = 220; + const height = 120; + const diffMask = new Uint8Array(width * height); + paintMaskRect(diffMask, width, { x: 20, y: 30, width: 20, height: 20 }); + + const deltas = summarizeNonTextDiffDeltas({ + diffMask, + width, + height, + regions: [], + ocr: { + provider: 'tesseract', + baselineBlocks: 1, + currentBlocks: 0, + baselineBlocksRaw: [ + { + text: 'Wi-Fi', + confidence: 90, + rect: { x: 68, y: 28, width: 60, height: 24 }, + normalizedRect: { x: 30.91, y: 23.33, width: 27.27, height: 20 }, + }, + ], + currentBlocksRaw: [], + matches: [], + }, + }); + + assert.equal(deltas.length, 1); + assert.equal(deltas[0]?.slot, 
'leading'); + assert.equal(deltas[0]?.likelyKind, 'icon'); + assert.equal(deltas[0]?.nearestText, 'Wi-Fi'); +}); + +test('summarizeNonTextDiffDeltas omits broad background residuals', () => { + const width = 220; + const height = 120; + const diffMask = new Uint8Array(width * height); + paintMaskRect(diffMask, width, { x: 10, y: 30, width: 180, height: 40 }); + + const deltas = summarizeNonTextDiffDeltas({ + diffMask, + width, + height, + regions: [ + { + index: 1, + rect: { x: 10, y: 30, width: 180, height: 40 }, + normalizedRect: { x: 4.55, y: 25, width: 81.82, height: 33.33 }, + differentPixels: 7200, + shareOfDiffPercentage: 100, + densityPercentage: 100, + shape: 'large-area', + size: 'large', + location: 'center', + averageBaselineColorHex: '#000000', + averageCurrentColorHex: '#ffffff', + baselineLuminance: 0, + currentLuminance: 255, + dominantChange: 'brighter', + }, + ], + }); + + assert.deepEqual(deltas, []); +}); diff --git a/src/utils/__tests__/screenshot-diff-ocr.test.ts b/src/utils/__tests__/screenshot-diff-ocr.test.ts new file mode 100644 index 000000000..6f6204dea --- /dev/null +++ b/src/utils/__tests__/screenshot-diff-ocr.test.ts @@ -0,0 +1,130 @@ +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { test } from 'vitest'; +import { + matchOcrBlocks, + parseTesseractTsv, + summarizeScreenshotOcr, + summarizeOcrMovementClusters, +} from '../screenshot-diff-ocr.ts'; + +test('parseTesseractTsv groups word rows into text line blocks', () => { + const blocks = parseTesseractTsv( + [ + 'level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext', + '5\t1\t1\t1\t1\t1\t100\t200\t40\t20\t96\tAirplane', + '5\t1\t1\t1\t1\t2\t150\t200\t30\t20\t94\tMode', + '5\t1\t1\t1\t1\t3\t300\t200\t90\t20\t92\tDisconnected', + '5\t1\t1\t1\t2\t1\t100\t240\t50\t20\t90\tWi-Fi', + '5\t1\t1\t1\t3\t1\t100\t280\t10\t20\t-1\t', + ].join('\n'), + 400, + 800, + ); + + 
assert.equal(blocks.length, 3); + assert.deepEqual(blocks[0], { + text: 'Airplane Mode', + confidence: 95, + rect: { x: 100, y: 200, width: 80, height: 20 }, + normalizedRect: { x: 25, y: 25, width: 20, height: 2.5 }, + }); + assert.deepEqual(blocks[1], { + text: 'Disconnected', + confidence: 92, + rect: { x: 300, y: 200, width: 90, height: 20 }, + normalizedRect: { x: 75, y: 25, width: 22.5, height: 2.5 }, + }); + assert.deepEqual(blocks[2], { + text: 'Wi-Fi', + confidence: 90, + rect: { x: 100, y: 240, width: 50, height: 20 }, + normalizedRect: { x: 25, y: 30, width: 12.5, height: 2.5 }, + }); +}); + +test('matchOcrBlocks reports movement and OCR bbox size change', () => { + const matches = matchOcrBlocks( + [ + { + text: 'Wi-Fi', + confidence: 96, + rect: { x: 100, y: 200, width: 50, height: 20 }, + normalizedRect: { x: 25, y: 25, width: 12.5, height: 2.5 }, + }, + ], + [ + { + text: 'Wi-Fi', + confidence: 94, + rect: { x: 112, y: 192, width: 60, height: 20 }, + normalizedRect: { x: 28, y: 24, width: 15, height: 2.5 }, + }, + ], + ); + + assert.equal(matches.length, 1); + assert.deepEqual(matches[0]?.delta, { x: 12, y: -8, width: 10, height: 0 }); + assert.equal(matches[0]?.possibleTextMetricMismatch, true); +}); + +test('summarizeOcrMovementClusters groups repeated x-axis text movement', () => { + const clusters = summarizeOcrMovementClusters([ + { + text: 'Wi-Fi', + baselineRect: { x: 100, y: 200, width: 50, height: 20 }, + currentRect: { x: 286, y: 120, width: 50, height: 20 }, + delta: { x: 186, y: -80, width: 0, height: 0 }, + confidence: 96, + possibleTextMetricMismatch: false, + }, + { + text: 'Bluetooth', + baselineRect: { x: 100, y: 260, width: 90, height: 20 }, + currentRect: { x: 284, y: 190, width: 90, height: 20 }, + delta: { x: 184, y: -70, width: 0, height: 0 }, + confidence: 90, + possibleTextMetricMismatch: false, + }, + { + text: 'Search', + baselineRect: { x: 100, y: 500, width: 90, height: 20 }, + currentRect: { x: 52, y: 560, width: 90, 
height: 20 }, + delta: { x: -48, y: 60, width: 0, height: 0 }, + confidence: 94, + possibleTextMetricMismatch: false, + }, + ]); + + assert.equal(clusters.length, 1); + assert.deepEqual(clusters[0]?.texts, ['Wi-Fi', 'Bluetooth']); + assert.deepEqual(clusters[0]?.xRange, { min: 184, max: 186 }); + assert.deepEqual(clusters[0]?.yRange, { min: -80, max: -70 }); +}); + +test('summarizeScreenshotOcr returns undefined when tesseract exits non-zero', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'ocr-test-')); + const binDir = path.join(dir, 'bin'); + const fakeTesseract = path.join(binDir, 'tesseract'); + const originalPath = process.env.PATH; + fs.mkdirSync(binDir, { recursive: true }); + fs.writeFileSync(fakeTesseract, '#!/bin/sh\nexit 2\n'); + fs.chmodSync(fakeTesseract, 0o755); + process.env.PATH = `${binDir}${path.delimiter}${originalPath ?? ''}`; + + try { + const result = await summarizeScreenshotOcr({ + baselinePath: path.join(dir, 'baseline.png'), + currentPath: path.join(dir, 'current.png'), + width: 100, + height: 100, + }); + assert.equal(result, undefined); + } finally { + if (originalPath === undefined) delete process.env.PATH; + else process.env.PATH = originalPath; + fs.rmSync(dir, { recursive: true, force: true }); + } +}); diff --git a/src/utils/__tests__/screenshot-diff.test.ts b/src/utils/__tests__/screenshot-diff.test.ts index bc1335281..06f4d4680 100644 --- a/src/utils/__tests__/screenshot-diff.test.ts +++ b/src/utils/__tests__/screenshot-diff.test.ts @@ -27,6 +27,22 @@ function writeSolidPng( fs.writeFileSync(filePath, PNG.sync.write(png)); } +function paintRect( + png: PNG, + rect: { x: number; y: number; width: number; height: number }, + color: { r: number; g: number; b: number }, +): void { + for (let y = rect.y; y < rect.y + rect.height; y += 1) { + for (let x = rect.x; x < rect.x + rect.width; x += 1) { + const index = (y * png.width + x) * 4; + png.data[index] = color.r; + png.data[index + 1] = color.g; + png.data[index + 
2] = color.b; + png.data[index + 3] = 255; + } + } +} + test('identical images produce match: true with 0% mismatch', async () => { const dir = tmpDir(); const baseline = path.join(dir, 'baseline.png'); @@ -85,6 +101,115 @@ test('completely different images produce match: false with 100% mismatch', asyn assert.ok(fs.existsSync(diffOut), 'diff image should be written'); }); +test('changed pixels are summarized into nearby diff regions', async () => { + const dir = tmpDir(); + const baseline = path.join(dir, 'baseline.png'); + const current = path.join(dir, 'current.png'); + const diffOut = path.join(dir, 'diff.png'); + + writeSolidPng(baseline, 40, 20, { r: 0, g: 0, b: 0 }); + + const currentPng = new PNG({ width: 40, height: 20 }); + for (let i = 0; i < currentPng.data.length; i += 4) { + currentPng.data[i] = 0; + currentPng.data[i + 1] = 0; + currentPng.data[i + 2] = 0; + currentPng.data[i + 3] = 255; + } + paintRect(currentPng, { x: 2, y: 2, width: 4, height: 4 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 10, y: 2, width: 4, height: 4 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 30, y: 15, width: 4, height: 4 }, { r: 255, g: 255, b: 255 }); + fs.writeFileSync(current, PNG.sync.write(currentPng)); + + const result = await compareScreenshots(baseline, current, { + outputPath: diffOut, + threshold: 0, + }); + + assert.equal(result.differentPixels, 48); + assert.equal(result.regions?.length, 2); + assert.deepEqual(result.regions?.[0]?.rect, { x: 2, y: 2, width: 12, height: 4 }); + assert.equal(result.regions?.[0]?.differentPixels, 32); + assert.equal(result.regions?.[0]?.shareOfDiffPercentage, 66.67); + assert.deepEqual(result.regions?.[0]?.normalizedRect, { x: 5, y: 10, width: 30, height: 20 }); + assert.equal(result.regions?.[0]?.densityPercentage, 66.67); + assert.equal(result.regions?.[0]?.shape, 'horizontal-band'); + assert.equal(result.regions?.[0]?.size, 'large'); + assert.equal(result.regions?.[0]?.averageBaselineColorHex, 
'#000000'); + assert.equal(result.regions?.[0]?.averageCurrentColorHex, '#ffffff'); + assert.equal(result.regions?.[0]?.baselineLuminance, 0); + assert.equal(result.regions?.[0]?.currentLuminance, 255); + assert.equal(result.regions?.[0]?.location, 'top-left'); + assert.equal(result.regions?.[0]?.dominantChange, 'brighter'); + assert.deepEqual(result.regions?.[1]?.rect, { x: 30, y: 15, width: 4, height: 4 }); + + const diffPng = PNG.sync.read(fs.readFileSync(diffOut)); + const borderPixel = (2 * diffPng.width + 2) * 4; + assert.equal(diffPng.data[borderPixel], 0); + assert.equal(diffPng.data[borderPixel + 1], 187); + assert.equal(diffPng.data[borderPixel + 2], 255); +}); + +test('large connected diff regions are split at horizontal low-density bands', async () => { + const dir = tmpDir(); + const baseline = path.join(dir, 'baseline.png'); + const current = path.join(dir, 'current.png'); + + writeSolidPng(baseline, 100, 220, { r: 0, g: 0, b: 0 }); + + const currentPng = new PNG({ width: 100, height: 220 }); + for (let i = 0; i < currentPng.data.length; i += 4) { + currentPng.data[i] = 0; + currentPng.data[i + 1] = 0; + currentPng.data[i + 2] = 0; + currentPng.data[i + 3] = 255; + } + paintRect(currentPng, { x: 0, y: 0, width: 100, height: 80 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 50, y: 80, width: 1, height: 50 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 0, y: 130, width: 100, height: 90 }, { r: 255, g: 255, b: 255 }); + fs.writeFileSync(current, PNG.sync.write(currentPng)); + + const result = await compareScreenshots(baseline, current, { + outputPath: path.join(dir, 'diff.png'), + threshold: 0, + }); + + assert.equal(result.regions?.length, 2); + const rectsByTop = result.regions?.map((region) => region.rect).sort((a, b) => a.y - b.y); + assert.deepEqual(rectsByTop, [ + { x: 0, y: 0, width: 100, height: 106 }, + { x: 0, y: 106, width: 100, height: 114 }, + ]); +}); + +test('large connected diff regions are not split at short 
low-density bands', async () => { + const dir = tmpDir(); + const baseline = path.join(dir, 'baseline.png'); + const current = path.join(dir, 'current.png'); + + writeSolidPng(baseline, 100, 220, { r: 0, g: 0, b: 0 }); + + const currentPng = new PNG({ width: 100, height: 220 }); + for (let i = 0; i < currentPng.data.length; i += 4) { + currentPng.data[i] = 0; + currentPng.data[i + 1] = 0; + currentPng.data[i + 2] = 0; + currentPng.data[i + 3] = 255; + } + paintRect(currentPng, { x: 0, y: 0, width: 100, height: 80 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 50, y: 80, width: 1, height: 4 }, { r: 255, g: 255, b: 255 }); + paintRect(currentPng, { x: 0, y: 84, width: 100, height: 136 }, { r: 255, g: 255, b: 255 }); + fs.writeFileSync(current, PNG.sync.write(currentPng)); + + const result = await compareScreenshots(baseline, current, { + outputPath: path.join(dir, 'diff.png'), + threshold: 0, + }); + + assert.equal(result.regions?.length, 1); + assert.deepEqual(result.regions?.[0]?.rect, { x: 0, y: 0, width: 100, height: 220 }); +}); + test('no diff path is persisted when outputPath is omitted', async () => { const dir = tmpDir(); const baseline = path.join(dir, 'baseline.png'); @@ -99,7 +224,7 @@ test('no diff path is persisted when outputPath is omitted', async () => { assert.equal(result.diffPath, undefined); }); -test('diff image marks different pixels as red and unchanged as dimmed gray', async () => { +test('diff image marks changed pixels red over a light current-screen context', async () => { const dir = tmpDir(); const baseline = path.join(dir, 'baseline.png'); const current = path.join(dir, 'current.png'); @@ -143,16 +268,15 @@ test('diff image marks different pixels as red and unchanged as dimmed gray', as // Read the diff image and verify pixel colors const diffPng = PNG.sync.read(fs.readFileSync(diffOut)); - // Pixel 0 (unchanged white): should be dimmed gray - // gray = round((255+255+255)/3) = 255, dimmed = round(255*0.3) = 77 - 
assert.equal(diffPng.data[0], 77); // R - assert.equal(diffPng.data[1], 77); // G - assert.equal(diffPng.data[2], 77); // B + // Pixel 0 (unchanged white): should stay light as screenshot context. + assert.equal(diffPng.data[0], 255); // R + assert.equal(diffPng.data[1], 255); // G + assert.equal(diffPng.data[2], 255); // B - // Pixel 1 (different): should be red - assert.equal(diffPng.data[4], 255); // R - assert.equal(diffPng.data[5], 0); // G - assert.equal(diffPng.data[6], 0); // B + // Pixel 1 (different): should be red-tinted while preserving light context. + assert.equal(diffPng.data[4], 228); // R + assert.equal(diffPng.data[5], 56); // G + assert.equal(diffPng.data[6], 56); // B }); test('dimension mismatch returns expected vs actual sizes', async () => { @@ -169,6 +293,9 @@ test('dimension mismatch returns expected vs actual sizes', async () => { assert.equal(result.match, false); assert.equal(result.mismatchPercentage, 100); assert.equal(result.diffPath, undefined, 'diffPath should not be set for dimension mismatch'); + assert.equal(result.regions, undefined); + assert.equal(result.ocr, undefined); + assert.equal(result.nonTextDeltas, undefined); assert.deepEqual(result.dimensionMismatch, { expected: { width: 10, height: 20 }, actual: { width: 15, height: 25 }, diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 944d90a6e..f74d25b51 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -886,7 +886,7 @@ const FLAG_DEFINITIONS: readonly FlagDefinition[] = [ type: 'boolean', usageLabel: '--overlay-refs', usageDescription: - 'Screenshot: draw current snapshot refs and target rectangles onto the saved PNG', + 'Screenshot: draw current snapshot refs and target rectangles onto the saved PNG; diff screenshot: also write a separate current-screen overlay guide', }, { key: 'screenshotFullscreen', @@ -993,11 +993,11 @@ const COMMAND_SCHEMAS: Record = { }, diff: { usageOverride: - 'diff snapshot | diff screenshot 
--baseline [--out ] [--threshold <0-1>]', + 'diff snapshot | diff screenshot --baseline [--out ] [--threshold <0-1>] [--overlay-refs]', helpDescription: 'Diff accessibility snapshot or compare screenshots pixel-by-pixel', summary: 'Diff snapshot or screenshot', positionalArgs: ['kind'], - allowedFlags: [...SNAPSHOT_FLAGS, 'baseline', 'threshold', 'out'], + allowedFlags: [...SNAPSHOT_FLAGS, 'baseline', 'threshold', 'out', 'overlayRefs'], }, 'ensure-simulator': { helpDescription: 'Ensure an iOS simulator exists in a device set (create if missing)', diff --git a/src/utils/output.ts b/src/utils/output.ts index da2f339c8..56fd7f6a5 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -4,6 +4,7 @@ import { buildSnapshotDisplayLines, formatSnapshotLine } from './snapshot-lines. import type { SnapshotNode, SnapshotVisibility } from './snapshot.ts'; import { displayNodeLabel } from './snapshot-tree.ts'; import type { ScreenshotDiffResult } from './screenshot-diff.ts'; +import type { ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; import { styleText } from 'node:util'; import { buildMobileSnapshotPresentation } from './mobile-snapshot-semantics.ts'; @@ -227,14 +228,168 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { lines.push(` ${label} ${displayPath}`); } + if (data.currentOverlayPath && !match) { + const relativePath = toRelativePath(data.currentOverlayPath); + const label = useColor ? colorize('Current overlay:', 'dim') : 'Current overlay:'; + const displayPath = useColor ? colorize(relativePath, 'green') : relativePath; + const refCount = toNumber(data.currentOverlayRefCount); + const refSuffix = refCount > 0 ? ` (${refCount} refs)` : ''; + lines.push(` ${label} ${displayPath}${refSuffix}`); + } + if (!match && !dimensionMismatch) { const diffCount = useColor ? 
colorize(String(differentPixels), 'red') : String(differentPixels); lines.push(` ${diffCount} different / ${totalPixels} total pixels`); } + const hints = !match && !dimensionMismatch ? formatScreenshotDiffHints(data) : []; + if (hints.length > 0) { + lines.push(' Hints:'); + for (const hint of hints) lines.push(` - ${hint}`); + } + + const regions = Array.isArray(data.regions) ? data.regions : []; + if (!match && !dimensionMismatch && regions.length > 0) { + lines.push(' Changed regions:'); + for (const region of regions.slice(0, 5)) { + const share = + region.shareOfDiffPercentage === 0 && region.differentPixels > 0 + ? '<0.01' + : String(region.shareOfDiffPercentage); + const rect = region.rect; + lines.push( + ` ${region.index}. ${region.location} x=${rect.x} y=${rect.y} ` + + `${rect.width}x${rect.height}, ${share}% of diff, change=${region.dominantChange}`, + ); + const detailLine = formatScreenshotRegionDetails(region); + if (detailLine) { + lines.push(` ${detailLine}`); + } + const bestMatch = region.currentOverlayMatches?.[0]; + if (bestMatch) { + const label = bestMatch.label ? ` "${bestMatch.label}"` : ''; + lines.push( + ` overlaps @${bestMatch.ref}${label}, ` + + `${bestMatch.regionCoveragePercentage}% of region`, + ); + } + } + } + + const ocrMatches = data.ocr?.matches ?? 
[]; + if (!match && !dimensionMismatch && ocrMatches.length > 0) { + const shownOcrMatches = ocrMatches.slice(0, 8); + lines.push( + ` OCR text deltas (${data.ocr?.provider}; baselineBlocks=${data.ocr?.baselineBlocks} ` + + `currentBlocks=${data.ocr?.currentBlocks}; showing ${shownOcrMatches.length}/${ocrMatches.length}; px):`, + ); + lines.push( + ' item | text | movePx | sizeDeltaPx | bboxBaseline | bboxCurrent | confidence | issueHint', + ); + for (const [index, ocrMatch] of shownOcrMatches.entries()) { + const delta = ocrMatch.delta; + lines.push( + ` ${index + 1} | ${JSON.stringify(ocrMatch.text)} | ` + + `${formatSignedPixels(delta.x)},${formatSignedPixels(delta.y)} | ` + + `${formatSignedPixels(delta.width)},${formatSignedPixels(delta.height)} | ` + + `${formatRect(ocrMatch.baselineRect)} | ${formatRect(ocrMatch.currentRect)} | ` + + `${ocrMatch.confidence} | ` + + `${ocrMatch.possibleTextMetricMismatch ? 'ocr-bbox-size-change' : '-'}`, + ); + } + } + + const nonTextDeltas = data.nonTextDeltas ?? []; + if (!match && !dimensionMismatch && nonTextDeltas.length > 0) { + const shownNonTextDeltas = nonTextDeltas.slice(0, 8); + lines.push( + ` Non-text visual deltas (showing ${shownNonTextDeltas.length}/${nonTextDeltas.length}; px):`, + ); + lines.push(' item | region | slot | kind | bboxCurrent | nearestText'); + for (const delta of shownNonTextDeltas) { + lines.push( + ` ${delta.index} | ${delta.regionIndex ? `r${delta.regionIndex}` : '-'} | ` + + `${delta.slot} | ${delta.likelyKind} | ${formatRect(delta.rect)} | ` + + `${delta.nearestText ? JSON.stringify(delta.nearestText) : '-'}`, + ); + } + } + return `${lines.join('\n')}\n`; } +function formatRect(rect: { x: number; y: number; width: number; height: number }): string { + return `x=${rect.x},y=${rect.y},w=${rect.width},h=${rect.height}`; +} + +function formatSignedPixels(value: number): string { + return value > 0 ? 
`+${value}` : String(value); +} + +function formatScreenshotDiffHints(data: ScreenshotDiffResult): string[] { + const hints: string[] = []; + const clusters = data.ocr?.movementClusters ?? []; + for (const cluster of clusters.slice(0, 2)) { + hints.push( + `text movement cluster: ${formatQuotedList(cluster.texts)} dx=${formatRange(cluster.xRange)}px ` + + `dy=${formatRange(cluster.yRange)}px`, + ); + } + + const controlDeltas = (data.nonTextDeltas ?? []) + .filter((delta) => ['icon', 'toggle', 'chevron'].includes(delta.likelyKind)) + .slice(0, 3); + if (controlDeltas.length > 0) { + hints.push(`non-text controls: ${controlDeltas.map(formatNonTextHint).join('; ')}`); + } + + const boundaryDeltas = (data.nonTextDeltas ?? []) + .filter((delta) => delta.likelyKind === 'separator') + .slice(0, 2); + if (boundaryDeltas.length > 0) { + hints.push(`non-text boundaries: ${boundaryDeltas.map(formatNonTextHint).join('; ')}`); + } + + return hints.slice(0, 6); +} + +function formatNonTextHint(delta: { + likelyKind: string; + nearestText?: string; + regionIndex?: number; +}): string { + const anchor = delta.nearestText ? ` near ${JSON.stringify(delta.nearestText)}` : ''; + const region = delta.regionIndex ? ` r${delta.regionIndex}` : ''; + return `${delta.likelyKind}${anchor}${region}`; +} + +function formatRange(range: { min: number; max: number }): string { + return range.min === range.max + ? formatSignedPixels(range.min) + : `${formatSignedPixels(range.min)}..${formatSignedPixels(range.max)}`; +} + +function formatQuotedList(values: string[]): string { + const shown = values.slice(0, 4).map((value) => JSON.stringify(value)); + const suffix = values.length > shown.length ? ` +${values.length - shown.length} more` : ''; + return `${shown.join(', ')}${suffix}`; +} + +function formatScreenshotRegionDetails(region: ScreenshotDiffRegion): string | null { + const details = [ + region.size ? `size=${region.size}` : null, + region.shape ? 
`shape=${region.shape}` : null, + typeof region.densityPercentage === 'number' ? `density=${region.densityPercentage}%` : null, + region.averageBaselineColorHex && region.averageCurrentColorHex + ? `avgColor=${region.averageBaselineColorHex}->${region.averageCurrentColorHex}` + : null, + typeof region.baselineLuminance === 'number' && typeof region.currentLuminance === 'number' + ? `luminance=${region.baselineLuminance}->${region.currentLuminance}` + : null, + ].filter((entry): entry is string => entry !== null); + return details.length > 0 ? details.join(' ') : null; +} + function toRelativePath(filePath: string): string { const cwd = process.cwd(); const relativePath = path.relative(cwd, filePath); diff --git a/src/utils/screenshot-diff-non-text.ts b/src/utils/screenshot-diff-non-text.ts new file mode 100644 index 000000000..7131a6537 --- /dev/null +++ b/src/utils/screenshot-diff-non-text.ts @@ -0,0 +1,499 @@ +import type { Rect } from './snapshot.ts'; +import type { ScreenshotOcrAnalysis, ScreenshotOcrBlock } from './screenshot-diff-ocr.ts'; +import type { ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; + +export type ScreenshotNonTextDelta = { + index: number; + regionIndex?: number; + slot: 'leading' | 'trailing' | 'background' | 'separator' | 'unknown'; + likelyKind: 'icon' | 'toggle' | 'chevron' | 'separator' | 'visual'; + rect: Rect; + nearestText?: string; +}; + +type NonTextKind = ScreenshotNonTextDelta['likelyKind'] | 'background'; + +const MAX_NON_TEXT_DELTAS = 12; +const OCR_MASK_PADDING_PX = 8; +const MIN_COMPONENT_PIXELS = 24; +const MIN_COMPONENT_SIDE = 3; +const MERGE_GAP_PX = 10; +const MIN_CONTENT_Y_RATIO = 0.08; +// Non-text hints classify residual geometry relative to the screenshot size. +// Aspect/density checks describe common UI glyph shapes rather than app-specific elements. 
+const SEPARATOR_MAX_THICKNESS_PX = 3; +const SEPARATOR_MIN_WIDTH_RATIO = 0.12; +const BACKGROUND_SLOT_WIDTH_RATIO = 0.4; +const UNKNOWN_BACKGROUND_SLOT_WIDTH_RATIO = 0.35; +const LARGE_RESIDUAL_WIDTH_RATIO = 0.25; +const LARGE_RESIDUAL_HEIGHT_RATIO = 0.06; +const TOGGLE_MIN_ASPECT_RATIO = 1.5; +const TOGGLE_MAX_ASPECT_RATIO = 3.8; +const TOGGLE_MIN_DENSITY_RATIO = 0.35; +const CHEVRON_MAX_WIDTH_RATIO = 0.06; +const CHEVRON_MAX_HEIGHT_RATIO = 0.04; +const ICON_MIN_ASPECT_RATIO = 0.55; +const ICON_MAX_ASPECT_RATIO = 1.8; +const LARGE_RESIDUAL_SCORE_PENALTY = -35; +const REGION_OVERLAP_SCORE = 20; +const MAX_PIXEL_COUNT_SCORE = 20; +const PIXELS_PER_SCORE_POINT = 200; +const KIND_SCORE = { + icon: 90, + toggle: 90, + chevron: 75, + separator: 45, + visual: 35, + background: 10, +} satisfies Record; +const SLOT_SCORE = { + leading: 20, + trailing: 20, + separator: 10, + unknown: 0, + background: -30, +} satisfies Record; + +type MutableComponent = { + minX: number; + minY: number; + maxX: number; + maxY: number; + differentPixels: number; +}; + +type ScoredNonTextDelta = Omit & { + likelyKind: NonTextKind; + score: number; +}; + +type OcrRow = { + rect: Rect; + blocks: ScreenshotOcrBlock[]; +}; + +export function summarizeNonTextDiffDeltas(params: { + diffMask: Uint8Array; + width: number; + height: number; + regions: ScreenshotDiffRegion[]; + ocr?: ScreenshotOcrAnalysis; + maxDeltas?: number; +}): ScreenshotNonTextDelta[] { + const maskedDiff = maskOcrText(params.diffMask, params.width, params.height, params.ocr); + const rawComponents = findConnectedComponents(maskedDiff, params.width, params.height); + const mergedComponents = mergeNearbyComponents(rawComponents, MERGE_GAP_PX); + const currentRows = groupOcrRows(params.ocr?.currentBlocksRaw ?? []); + const baselineRows = groupOcrRows(params.ocr?.baselineBlocksRaw ?? 
[]); + return ( + mergedComponents + .filter(hasUsefulComponentSize) + .map((component) => toNonTextDelta(component, params, currentRows, baselineRows)) + // Status bars and top chrome tend to produce noisy residuals around time, + // signal, and battery text; changed regions still report that area. + .filter((delta) => delta.rect.y >= params.height * MIN_CONTENT_Y_RATIO) + .filter(hasAgentFacingKind) + .sort((left, right) => right.score - left.score) + .slice(0, Math.max(0, params.maxDeltas ?? MAX_NON_TEXT_DELTAS)) + .map((delta, index) => toPublicNonTextDelta(delta, index + 1)) + ); +} + +function maskOcrText( + diffMask: Uint8Array, + width: number, + height: number, + ocr: ScreenshotOcrAnalysis | undefined, +): Uint8Array { + const maskedDiff = new Uint8Array(diffMask); + if (!ocr) return maskedDiff; + for (const block of [...ocr.baselineBlocksRaw, ...ocr.currentBlocksRaw]) { + clearRect(maskedDiff, width, height, expandRect(block.rect, OCR_MASK_PADDING_PX)); + } + return maskedDiff; +} + +function findConnectedComponents( + mask: Uint8Array, + width: number, + height: number, +): MutableComponent[] { + const visited = new Uint8Array(mask.length); + const queue = new Int32Array(mask.length); + const components: MutableComponent[] = []; + for (let pixelIndex = 0; pixelIndex < mask.length; pixelIndex += 1) { + if (mask[pixelIndex] !== 1 || visited[pixelIndex] === 1) continue; + let queueStart = 0; + let queueEnd = 0; + queue[queueEnd] = pixelIndex; + queueEnd += 1; + visited[pixelIndex] = 1; + + const startX = pixelIndex % width; + const startY = Math.floor(pixelIndex / width); + const component: MutableComponent = { + minX: startX, + minY: startY, + maxX: startX, + maxY: startY, + differentPixels: 0, + }; + + while (queueStart < queueEnd) { + const currentIndex = queue[queueStart]!; + queueStart += 1; + const x = currentIndex % width; + const y = Math.floor(currentIndex / width); + component.minX = Math.min(component.minX, x); + component.minY = 
Math.min(component.minY, y); + component.maxX = Math.max(component.maxX, x); + component.maxY = Math.max(component.maxY, y); + component.differentPixels += 1; + + for (let yOffset = -1; yOffset <= 1; yOffset += 1) { + const neighborY = y + yOffset; + if (neighborY < 0 || neighborY >= height) continue; + for (let xOffset = -1; xOffset <= 1; xOffset += 1) { + if (xOffset === 0 && yOffset === 0) continue; + const neighborX = x + xOffset; + if (neighborX < 0 || neighborX >= width) continue; + const neighborIndex = neighborY * width + neighborX; + if (mask[neighborIndex] !== 1 || visited[neighborIndex] === 1) continue; + visited[neighborIndex] = 1; + queue[queueEnd] = neighborIndex; + queueEnd += 1; + } + } + } + components.push(component); + } + return components; +} + +function mergeNearbyComponents(components: MutableComponent[], gapPx: number): MutableComponent[] { + const merged: MutableComponent[] = []; + for (const component of components.sort( + (left, right) => left.minY - right.minY || left.minX - right.minX, + )) { + const existing = merged.find((candidate) => componentsAreNear(candidate, component, gapPx)); + if (!existing) { + merged.push({ ...component }); + continue; + } + existing.minX = Math.min(existing.minX, component.minX); + existing.minY = Math.min(existing.minY, component.minY); + existing.maxX = Math.max(existing.maxX, component.maxX); + existing.maxY = Math.max(existing.maxY, component.maxY); + existing.differentPixels += component.differentPixels; + } + return merged; +} + +function toNonTextDelta( + component: MutableComponent, + params: { + width: number; + height: number; + regions: ScreenshotDiffRegion[]; + }, + currentRows: OcrRow[], + baselineRows: OcrRow[], +): ScoredNonTextDelta { + const rect = componentToRect(component); + const regionIndex = findContainingRegionIndex(rect, params.regions); + const textAnchor = findTextAnchor(rect, currentRows, baselineRows); + const slot = classifySlot(rect, textAnchor?.block.rect, params.width); + 
const likelyKind = classifyLikelyKind(rect, slot, component.differentPixels, params); + const scoreParams = { + ...(regionIndex ? { regionIndex } : {}), + slot, + likelyKind, + rect, + }; + return { + ...(regionIndex ? { regionIndex } : {}), + slot, + likelyKind, + rect, + ...(textAnchor ? { nearestText: cleanOcrAnchorText(textAnchor.block.text) } : {}), + score: scoreNonTextDelta(scoreParams, component.differentPixels, params), + }; +} + +function toPublicNonTextDelta( + delta: ScoredNonTextDelta & { likelyKind: ScreenshotNonTextDelta['likelyKind'] }, + index: number, +): ScreenshotNonTextDelta { + return { + index, + ...(delta.regionIndex ? { regionIndex: delta.regionIndex } : {}), + slot: delta.slot, + likelyKind: delta.likelyKind, + rect: delta.rect, + ...(delta.nearestText ? { nearestText: delta.nearestText } : {}), + }; +} + +function classifySlot( + rect: Rect, + nearestTextRect: Rect | undefined, + imageWidth: number, +): ScreenshotNonTextDelta['slot'] { + if ( + rect.height <= SEPARATOR_MAX_THICKNESS_PX && + rect.width >= imageWidth * SEPARATOR_MIN_WIDTH_RATIO + ) { + return 'separator'; + } + if (!nearestTextRect) { + if (rect.width >= imageWidth * BACKGROUND_SLOT_WIDTH_RATIO) return 'background'; + return 'unknown'; + } + if (rect.width >= imageWidth * BACKGROUND_SLOT_WIDTH_RATIO) return 'background'; + const rectCenterX = rect.x + rect.width / 2; + const textCenterX = nearestTextRect.x + nearestTextRect.width / 2; + if (rectCenterX < textCenterX - nearestTextRect.width / 2) return 'leading'; + if (rectCenterX > textCenterX + nearestTextRect.width / 2) return 'trailing'; + return rect.width >= imageWidth * UNKNOWN_BACKGROUND_SLOT_WIDTH_RATIO ? 
'background' : 'unknown'; +} + +function classifyLikelyKind( + rect: Rect, + slot: ScreenshotNonTextDelta['slot'], + differentPixels: number, + image: { width: number; height: number }, +): NonTextKind { + const aspect = rect.width / rect.height; + const density = differentPixels / (rect.width * rect.height); + if (slot === 'separator') return 'separator'; + if (slot === 'background') return 'background'; + if ( + slot === 'trailing' && + aspect >= TOGGLE_MIN_ASPECT_RATIO && + aspect <= TOGGLE_MAX_ASPECT_RATIO && + density >= TOGGLE_MIN_DENSITY_RATIO + ) { + return 'toggle'; + } + if ( + slot === 'trailing' && + rect.width <= image.width * CHEVRON_MAX_WIDTH_RATIO && + rect.height <= image.height * CHEVRON_MAX_HEIGHT_RATIO + ) { + return 'chevron'; + } + if (slot === 'leading' && aspect >= ICON_MIN_ASPECT_RATIO && aspect <= ICON_MAX_ASPECT_RATIO) { + return 'icon'; + } + if (isLargeResidual(rect, image)) return 'background'; + return 'visual'; +} + +function hasAgentFacingKind( + delta: ScoredNonTextDelta, +): delta is ScoredNonTextDelta & { likelyKind: ScreenshotNonTextDelta['likelyKind'] } { + return delta.likelyKind !== 'background'; +} + +function scoreNonTextDelta( + delta: { + regionIndex?: number; + slot: ScreenshotNonTextDelta['slot']; + likelyKind: NonTextKind; + rect: Rect; + }, + differentPixels: number, + image: { width: number; height: number }, +): number { + const sizePenalty = isLargeResidual(delta.rect, image) ? LARGE_RESIDUAL_SCORE_PENALTY : 0; + const regionScore = delta.regionIndex ? 
REGION_OVERLAP_SCORE : 0; + return ( + KIND_SCORE[delta.likelyKind] + + SLOT_SCORE[delta.slot] + + regionScore + + sizePenalty + + Math.min(MAX_PIXEL_COUNT_SCORE, differentPixels / PIXELS_PER_SCORE_POINT) + ); +} + +function isLargeResidual(rect: Rect, image: { width: number; height: number }): boolean { + return ( + rect.width >= image.width * LARGE_RESIDUAL_WIDTH_RATIO || + rect.height >= image.height * LARGE_RESIDUAL_HEIGHT_RATIO + ); +} + +function findContainingRegionIndex( + rect: Rect, + regions: ScreenshotDiffRegion[], +): number | undefined { + let bestRegion: ScreenshotDiffRegion | undefined; + let bestOverlap = 0; + for (const region of regions) { + const overlap = intersectArea(rect, region.rect); + if (overlap <= bestOverlap) continue; + bestOverlap = overlap; + bestRegion = region; + } + return bestRegion?.index; +} + +function findTextAnchor( + rect: Rect, + currentRows: OcrRow[], + baselineRows: OcrRow[], +): { block: ScreenshotOcrBlock; distance: number } | undefined { + const currentRow = findOverlappingRow(rect, currentRows); + if (currentRow) return findNearestText(rect, currentRow.blocks); + const baselineRow = findOverlappingRow(rect, baselineRows); + return baselineRow ? 
findNearestText(rect, baselineRow.blocks) : undefined; +} + +function findOverlappingRow(rect: Rect, rows: OcrRow[]): OcrRow | undefined { + let bestRow: OcrRow | undefined; + let bestOverlap = 0; + for (const row of rows) { + const overlap = verticalOverlap(rect, row.rect); + if (overlap <= bestOverlap) continue; + bestOverlap = overlap; + bestRow = row; + } + return bestRow; +} + +function groupOcrRows(blocks: ScreenshotOcrBlock[]): OcrRow[] { + const rows: OcrRow[] = []; + for (const block of [...blocks].sort((left, right) => left.rect.y - right.rect.y)) { + const row = rows.find((candidate) => blocksShareRow(candidate.rect, block.rect)); + if (!row) { + rows.push({ rect: block.rect, blocks: [block] }); + continue; + } + row.blocks.push(block); + row.blocks.sort((left, right) => left.rect.x - right.rect.x); + row.rect = unionRects([row.rect, block.rect]); + } + return rows; +} + +function blocksShareRow(left: Rect, right: Rect): boolean { + const overlap = verticalOverlap(left, right); + if (overlap > 0) return true; + const centerDistance = Math.abs(rectCenter(left).y - rectCenter(right).y); + return centerDistance <= Math.max(left.height, right.height) * 0.5; +} + +function findNearestText( + rect: Rect, + textBlocks: ScreenshotOcrBlock[], +): { block: ScreenshotOcrBlock; distance: number } | undefined { + let nearest: { block: ScreenshotOcrBlock; distance: number } | undefined; + const center = rectCenter(rect); + for (const block of textBlocks) { + const distance = Math.sqrt(squaredDistance(center, rectCenter(block.rect))); + if (nearest && distance >= nearest.distance) continue; + nearest = { block, distance }; + } + return nearest; +} + +function unionRects(rects: Rect[]): Rect { + let minX = Number.POSITIVE_INFINITY; + let minY = Number.POSITIVE_INFINITY; + let maxX = Number.NEGATIVE_INFINITY; + let maxY = Number.NEGATIVE_INFINITY; + for (const rect of rects) { + minX = Math.min(minX, rect.x); + minY = Math.min(minY, rect.y); + maxX = Math.max(maxX, 
rect.x + rect.width); + maxY = Math.max(maxY, rect.y + rect.height); + } + return { x: minX, y: minY, width: maxX - minX, height: maxY - minY }; +} + +function cleanOcrAnchorText(text: string): string { + return text + .trim() + .replace(/^[^\p{L}\p{N}]+/u, '') + .replace(/^\p{L}\s+/u, ''); +} + +function hasUsefulComponentSize(component: MutableComponent): boolean { + const rect = componentToRect(component); + return ( + component.differentPixels >= MIN_COMPONENT_PIXELS && + rect.width >= MIN_COMPONENT_SIDE && + rect.height >= MIN_COMPONENT_SIDE + ); +} + +function componentToRect(component: MutableComponent): Rect { + return { + x: component.minX, + y: component.minY, + width: component.maxX - component.minX + 1, + height: component.maxY - component.minY + 1, + }; +} + +function expandRect(rect: Rect, padding: number): Rect { + return { + x: rect.x - padding, + y: rect.y - padding, + width: rect.width + padding * 2, + height: rect.height + padding * 2, + }; +} + +function clearRect(mask: Uint8Array, width: number, height: number, rect: Rect): void { + const minX = clamp(Math.floor(rect.x), 0, width - 1); + const minY = clamp(Math.floor(rect.y), 0, height - 1); + const maxX = clamp(Math.ceil(rect.x + rect.width), 0, width); + const maxY = clamp(Math.ceil(rect.y + rect.height), 0, height); + for (let y = minY; y < maxY; y += 1) { + for (let x = minX; x < maxX; x += 1) { + mask[y * width + x] = 0; + } + } +} + +function componentsAreNear( + left: MutableComponent, + right: MutableComponent, + gapPx: number, +): boolean { + return ( + left.minX - gapPx <= right.maxX && + right.minX - gapPx <= left.maxX && + left.minY - gapPx <= right.maxY && + right.minY - gapPx <= left.maxY + ); +} + +function intersectArea(left: Rect, right: Rect): number { + const minX = Math.max(left.x, right.x); + const minY = Math.max(left.y, right.y); + const maxX = Math.min(left.x + left.width, right.x + right.width); + const maxY = Math.min(left.y + left.height, right.y + right.height); + if 
(maxX <= minX || maxY <= minY) return 0; + return (maxX - minX) * (maxY - minY); +} + +function verticalOverlap(left: Rect, right: Rect): number { + return Math.max( + 0, + Math.min(left.y + left.height, right.y + right.height) - Math.max(left.y, right.y), + ); +} + +function rectCenter(rect: Rect): { x: number; y: number } { + return { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 }; +} + +function squaredDistance(left: { x: number; y: number }, right: { x: number; y: number }): number { + return (left.x - right.x) ** 2 + (left.y - right.y) ** 2; +} + +function clamp(value: number, min: number, max: number): number { + return Math.min(Math.max(value, min), max); +} diff --git a/src/utils/screenshot-diff-ocr.ts b/src/utils/screenshot-diff-ocr.ts new file mode 100644 index 000000000..f0dffea0f --- /dev/null +++ b/src/utils/screenshot-diff-ocr.ts @@ -0,0 +1,388 @@ +import type { Rect } from './snapshot.ts'; +import { runCmd, whichCmd } from './exec.ts'; + +export type ScreenshotOcrBlock = { + text: string; + confidence: number; + rect: Rect; + normalizedRect: Rect; +}; + +export type ScreenshotOcrTextMatch = { + text: string; + baselineRect: Rect; + currentRect: Rect; + delta: { x: number; y: number; width: number; height: number }; + confidence: number; + possibleTextMetricMismatch: boolean; +}; + +export type ScreenshotOcrMovementCluster = { + texts: string[]; + xRange: { min: number; max: number }; + yRange: { min: number; max: number }; +}; + +export type ScreenshotOcrSummary = { + provider: 'tesseract'; + baselineBlocks: number; + currentBlocks: number; + matches: ScreenshotOcrTextMatch[]; + movementClusters?: ScreenshotOcrMovementCluster[]; +}; + +export type ScreenshotOcrAnalysis = ScreenshotOcrSummary & { + baselineBlocksRaw: ScreenshotOcrBlock[]; + currentBlocksRaw: ScreenshotOcrBlock[]; +}; + +type TesseractWord = { + key: string; + text: string; + confidence: number; + rect: Rect; +}; + +const OCR_TIMEOUT_MS = 10_000; +const MAX_OCR_MATCHES = 12; 
+const MAX_MOVEMENT_CLUSTERS = 4; +const MIN_CLUSTERED_MATCHES = 2; +const MOVEMENT_CLUSTER_MAX_X_SPREAD_PX = 32; +const MOVEMENT_CLUSTER_MAX_Y_SPREAD_PX = 60; +// OCR text matching uses small generic movement/shape thresholds; the fixed gap +// is only a floor before falling back to word-height-relative spacing. +const MIN_MEANINGFUL_DELTA_PX = 2; +const MIN_SEGMENT_GAP_PX = 48; +const TEXT_WIDTH_MISMATCH_RATIO = 0.08; +const TEXT_HEIGHT_MISMATCH_RATIO = 0.12; + +export async function summarizeScreenshotOcr(params: { + baselinePath: string; + currentPath: string; + width: number; + height: number; +}): Promise { + if (!(await whichCmd('tesseract'))) return undefined; + + try { + const [baselineResult, currentResult] = await Promise.all([ + runTesseractTsv(params.baselinePath), + runTesseractTsv(params.currentPath), + ]); + if (baselineResult.exitCode !== 0 || currentResult.exitCode !== 0) return undefined; + + const baselineBlocks = parseTesseractTsv(baselineResult.stdout, params.width, params.height); + const currentBlocks = parseTesseractTsv(currentResult.stdout, params.width, params.height); + const matches = matchOcrBlocks(baselineBlocks, currentBlocks); + const movementClusters = summarizeOcrMovementClusters(matches); + if (baselineBlocks.length === 0 && currentBlocks.length === 0) return undefined; + + return { + provider: 'tesseract', + baselineBlocks: baselineBlocks.length, + currentBlocks: currentBlocks.length, + baselineBlocksRaw: baselineBlocks, + currentBlocksRaw: currentBlocks, + matches, + ...(movementClusters.length > 0 ? 
{ movementClusters } : {}), + }; + } catch { + return undefined; + } +} + +export function parseTesseractTsv( + tsv: string, + imageWidth: number, + imageHeight: number, +): ScreenshotOcrBlock[] { + const [headerLine, ...lines] = tsv.split(/\r?\n/); + if (!headerLine) return []; + + const headers = headerLine.split('\t'); + const indexByName = new Map(headers.map((header, index) => [header, index])); + const words: TesseractWord[] = []; + for (const line of lines) { + if (!line.trim()) continue; + const values = line.split('\t'); + const level = readTsvNumber(values, indexByName, 'level'); + const rawText = readTsvString(values, indexByName, 'text').trim(); + const confidence = readTsvNumber(values, indexByName, 'conf'); + // Tesseract TSV uses level=5 for word rows; higher-level rows are page/block/line containers. + if (level !== 5 || !isMeaningfulText(rawText) || confidence < 0) continue; + + const left = readTsvNumber(values, indexByName, 'left'); + const top = readTsvNumber(values, indexByName, 'top'); + const width = readTsvNumber(values, indexByName, 'width'); + const height = readTsvNumber(values, indexByName, 'height'); + if (width <= 0 || height <= 0) continue; + + words.push({ + key: [ + readTsvString(values, indexByName, 'page_num'), + readTsvString(values, indexByName, 'block_num'), + readTsvString(values, indexByName, 'par_num'), + readTsvString(values, indexByName, 'line_num'), + ].join(':'), + text: rawText, + confidence, + rect: { x: left, y: top, width, height }, + }); + } + + const wordsByLine = new Map(); + for (const word of words) { + const existing = wordsByLine.get(word.key); + if (existing) existing.push(word); + else wordsByLine.set(word.key, [word]); + } + + return Array.from(wordsByLine.values()) + .flatMap((lineWords) => splitLineWordsIntoSegments(lineWords)) + .map((segmentWords) => toOcrBlock(segmentWords, imageWidth, imageHeight)) + .filter((block): block is ScreenshotOcrBlock => block !== null); +} + +export function matchOcrBlocks( 
+ baselineBlocks: ScreenshotOcrBlock[], + currentBlocks: ScreenshotOcrBlock[], +): ScreenshotOcrTextMatch[] { + const usedCurrent = new Set(); + const matches: ScreenshotOcrTextMatch[] = []; + + for (const baselineBlock of baselineBlocks) { + const normalizedText = normalizeTextForMatching(baselineBlock.text); + const currentIndex = findBestCurrentMatch( + baselineBlock, + normalizedText, + currentBlocks, + usedCurrent, + ); + if (currentIndex === null) continue; + usedCurrent.add(currentIndex); + + const currentBlock = currentBlocks[currentIndex]!; + const match = toOcrTextMatch(baselineBlock, currentBlock); + if (!hasMeaningfulOcrDelta(match)) continue; + matches.push(match); + } + + return matches + .sort((left, right) => scoreOcrMatch(right) - scoreOcrMatch(left)) + .slice(0, MAX_OCR_MATCHES); +} + +function runTesseractTsv(imagePath: string): ReturnType { + return runCmd('tesseract', [imagePath, 'stdout', '-l', 'eng', 'tsv'], { + allowFailure: true, + timeoutMs: OCR_TIMEOUT_MS, + }); +} + +function toOcrBlock( + words: TesseractWord[], + imageWidth: number, + imageHeight: number, +): ScreenshotOcrBlock | null { + if (words.length === 0) return null; + const sortedWords = [...words].sort((left, right) => left.rect.x - right.rect.x); + const rect = unionRects(sortedWords.map((word) => word.rect)); + const confidence = Math.round(average(sortedWords.map((word) => word.confidence)) * 100) / 100; + return { + text: sortedWords.map((word) => word.text).join(' '), + confidence, + rect, + normalizedRect: { + x: roundPercentage(rect.x / imageWidth), + y: roundPercentage(rect.y / imageHeight), + width: roundPercentage(rect.width / imageWidth), + height: roundPercentage(rect.height / imageHeight), + }, + }; +} + +function splitLineWordsIntoSegments(words: TesseractWord[]): TesseractWord[][] { + const sortedWords = [...words].sort((left, right) => left.rect.x - right.rect.x); + const segments: TesseractWord[][] = []; + let currentSegment: TesseractWord[] = []; + for 
(const word of sortedWords) { + const previousWord = currentSegment.at(-1); + if (!previousWord) { + currentSegment.push(word); + continue; + } + + const gap = word.rect.x - (previousWord.rect.x + previousWord.rect.width); + const height = Math.max(previousWord.rect.height, word.rect.height); + if (gap > Math.max(MIN_SEGMENT_GAP_PX, height * 2.5)) { + segments.push(currentSegment); + currentSegment = [word]; + continue; + } + currentSegment.push(word); + } + if (currentSegment.length > 0) segments.push(currentSegment); + return segments; +} + +function findBestCurrentMatch( + baselineBlock: ScreenshotOcrBlock, + normalizedText: string, + currentBlocks: ScreenshotOcrBlock[], + usedCurrent: Set, +): number | null { + let bestIndex: number | null = null; + let bestDistance = Number.POSITIVE_INFINITY; + for (let index = 0; index < currentBlocks.length; index += 1) { + if (usedCurrent.has(index)) continue; + const currentBlock = currentBlocks[index]!; + if (normalizeTextForMatching(currentBlock.text) !== normalizedText) continue; + const distance = squaredDistance( + rectCenter(baselineBlock.normalizedRect), + rectCenter(currentBlock.normalizedRect), + ); + if (distance >= bestDistance) continue; + bestIndex = index; + bestDistance = distance; + } + return bestIndex; +} + +function toOcrTextMatch( + baselineBlock: ScreenshotOcrBlock, + currentBlock: ScreenshotOcrBlock, +): ScreenshotOcrTextMatch { + const delta = { + x: currentBlock.rect.x - baselineBlock.rect.x, + y: currentBlock.rect.y - baselineBlock.rect.y, + width: currentBlock.rect.width - baselineBlock.rect.width, + height: currentBlock.rect.height - baselineBlock.rect.height, + }; + const widthRatio = roundRatio(currentBlock.rect.width / baselineBlock.rect.width); + const heightRatio = roundRatio(currentBlock.rect.height / baselineBlock.rect.height); + const possibleTextMetricMismatch = + Math.abs(widthRatio - 1) >= TEXT_WIDTH_MISMATCH_RATIO || + Math.abs(heightRatio - 1) >= TEXT_HEIGHT_MISMATCH_RATIO; + return 
{ + text: baselineBlock.text, + baselineRect: baselineBlock.rect, + currentRect: currentBlock.rect, + delta, + confidence: Math.round(Math.min(baselineBlock.confidence, currentBlock.confidence) * 100) / 100, + possibleTextMetricMismatch, + }; +} + +function hasMeaningfulOcrDelta(match: ScreenshotOcrTextMatch): boolean { + return ( + Math.abs(match.delta.x) >= MIN_MEANINGFUL_DELTA_PX || + Math.abs(match.delta.y) >= MIN_MEANINGFUL_DELTA_PX || + Math.abs(match.delta.width) >= MIN_MEANINGFUL_DELTA_PX || + Math.abs(match.delta.height) >= MIN_MEANINGFUL_DELTA_PX || + match.possibleTextMetricMismatch + ); +} + +function scoreOcrMatch(match: ScreenshotOcrTextMatch): number { + return ( + Math.abs(match.delta.x) + + Math.abs(match.delta.y) + + Math.abs(match.delta.width) + + Math.abs(match.delta.height) + + (match.possibleTextMetricMismatch ? 25 : 0) + ); +} + +export function summarizeOcrMovementClusters( + matches: ScreenshotOcrTextMatch[], +): ScreenshotOcrMovementCluster[] { + const clusters: ScreenshotOcrTextMatch[][] = []; + for (const match of [...matches].sort( + (left, right) => left.currentRect.y - right.currentRect.y, + )) { + const cluster = clusters.find( + (candidate) => + Math.abs(match.delta.x - average(candidate.map((item) => item.delta.x))) <= + MOVEMENT_CLUSTER_MAX_X_SPREAD_PX, + ); + if (cluster) cluster.push(match); + else clusters.push([match]); + } + + return clusters + .filter((cluster) => cluster.length >= MIN_CLUSTERED_MATCHES) + .map(toMovementCluster) + .filter( + (cluster) => cluster.yRange.max - cluster.yRange.min <= MOVEMENT_CLUSTER_MAX_Y_SPREAD_PX, + ) + .sort((left, right) => scoreMovementCluster(right) - scoreMovementCluster(left)) + .slice(0, MAX_MOVEMENT_CLUSTERS); +} + +function toMovementCluster(matches: ScreenshotOcrTextMatch[]): ScreenshotOcrMovementCluster { + const xDeltas = matches.map((match) => match.delta.x); + const yDeltas = matches.map((match) => match.delta.y); + return { + texts: matches.map((match) => match.text), + 
xRange: { min: Math.min(...xDeltas), max: Math.max(...xDeltas) }, + yRange: { min: Math.min(...yDeltas), max: Math.max(...yDeltas) }, + }; +} + +function scoreMovementCluster(cluster: ScreenshotOcrMovementCluster): number { + const averageX = (cluster.xRange.min + cluster.xRange.max) / 2; + const averageY = (cluster.yRange.min + cluster.yRange.max) / 2; + return Math.abs(averageX) * 2 + Math.abs(averageY); +} + +function unionRects(rects: Rect[]): Rect { + let minX = Number.POSITIVE_INFINITY; + let minY = Number.POSITIVE_INFINITY; + let maxX = Number.NEGATIVE_INFINITY; + let maxY = Number.NEGATIVE_INFINITY; + for (const rect of rects) { + minX = Math.min(minX, rect.x); + minY = Math.min(minY, rect.y); + maxX = Math.max(maxX, rect.x + rect.width); + maxY = Math.max(maxY, rect.y + rect.height); + } + return { x: minX, y: minY, width: maxX - minX, height: maxY - minY }; +} + +function rectCenter(rect: Rect): { x: number; y: number } { + return { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 }; +} + +function squaredDistance(left: { x: number; y: number }, right: { x: number; y: number }): number { + return (left.x - right.x) ** 2 + (left.y - right.y) ** 2; +} + +function readTsvString(values: string[], indexByName: Map, name: string): string { + const index = indexByName.get(name); + return index === undefined ? '' : (values[index] ?? ''); +} + +function readTsvNumber(values: string[], indexByName: Map, name: string): number { + const value = Number(readTsvString(values, indexByName, name)); + return Number.isFinite(value) ? 
/** True when the text contains at least one Unicode letter or number. */
function isMeaningfulText(text: string): boolean {
  return text.match(/[\p{L}\p{N}]/u) !== null;
}

/** Trims, collapses every whitespace run to one space, and lower-cases. */
function normalizeTextForMatching(text: string): string {
  return text.trim().split(/\s+/).join(' ').toLowerCase();
}

/** Arithmetic mean of `values`; NaN for an empty list (0 divided by 0). */
function average(values: number[]): number {
  let total = 0;
  for (const value of values) total += value;
  return total / values.length;
}

/** Converts a ratio to a percentage rounded to two decimals. */
function roundPercentage(ratio: number): number {
  const percent = ratio * 100;
  return Math.round(percent * 100) / 100;
}

/** Rounds a ratio to three decimals. */
function roundRatio(ratio: number): number {
  const scaled = Math.round(ratio * 1000);
  return scaled / 1000;
}

// Upper bound on overlay refs reported for a single diff region.
const MAX_MATCHES_PER_REGION = 3;

/**
 * Attaches the best-overlapping overlay refs to each diff region.
 *
 * Regions without any overlapping ref are returned as the same object;
 * regions with matches are shallow-copied with `currentOverlayMatches` set.
 */
export function attachCurrentOverlayMatches(
  regions: ScreenshotDiffRegion[],
  overlayRefs: ScreenshotOverlayRef[],
): ScreenshotDiffRegion[] {
  const annotated: ScreenshotDiffRegion[] = [];
  for (const region of regions) {
    const currentOverlayMatches = findRegionOverlayMatches(region, overlayRefs);
    annotated.push(
      currentOverlayMatches.length > 0 ? { ...region, currentOverlayMatches } : region,
    );
  }
  return annotated;
}

/**
 * Finds the overlay refs whose `overlayRect` intersects the region.
 *
 * Candidates are ranked by how much of the region they cover, ties broken by
 * how much of the overlay rect lies inside the region. The tie-break figure is
 * used for ordering only and is not part of the returned matches.
 *
 * @returns Up to MAX_MATCHES_PER_REGION matches, best first.
 */
function findRegionOverlayMatches(
  region: ScreenshotDiffRegion,
  overlayRefs: ScreenshotOverlayRef[],
): ScreenshotDiffRegionOverlayMatch[] {
  type RankedMatch = ScreenshotDiffRegionOverlayMatch & { overlayCoveragePercentage: number };
  const regionArea = rectArea(region.rect);
  const ranked: RankedMatch[] = [];
  for (const overlayRef of overlayRefs) {
    const overlayRect = overlayRef.overlayRect;
    const overlapArea = intersectArea(region.rect, overlayRect);
    if (overlapArea <= 0) continue;
    ranked.push({
      ref: overlayRef.ref,
      ...(overlayRef.label ? { label: overlayRef.label } : {}),
      rect: overlayRect,
      overlayCoveragePercentage: roundPercentage(overlapArea / rectArea(overlayRect)),
      regionCoveragePercentage: roundPercentage(overlapArea / regionArea),
    });
  }
  ranked.sort((left, right) => {
    const coverageDelta = right.regionCoveragePercentage - left.regionCoveragePercentage;
    if (coverageDelta !== 0) return coverageDelta;
    return right.overlayCoveragePercentage - left.overlayCoveragePercentage;
  });
  return ranked.slice(0, MAX_MATCHES_PER_REGION).map((match) => ({
    ref: match.ref,
    ...(match.label ? { label: match.label } : {}),
    rect: match.rect,
    regionCoveragePercentage: match.regionCoveragePercentage,
  }));
}
/** Area shared by two rectangles; 0 when they only touch or are disjoint. */
function intersectArea(left: Rect, right: Rect): number {
  const overlapLeft = Math.max(left.x, right.x);
  const overlapTop = Math.max(left.y, right.y);
  const overlapRight = Math.min(left.x + left.width, right.x + right.width);
  const overlapBottom = Math.min(left.y + left.height, right.y + right.height);
  const isEmpty = overlapRight <= overlapLeft || overlapBottom <= overlapTop;
  return isEmpty ? 0 : (overlapRight - overlapLeft) * (overlapBottom - overlapTop);
}

/** Width times height. */
function rectArea(rect: Rect): number {
  const { width, height } = rect;
  return width * height;
}

/** Converts a ratio to a percentage rounded to two decimals. */
function roundPercentage(ratio: number): number {
  const percent = ratio * 100;
  return Math.round(percent * 100) / 100;
}

// RGBA color and thickness (in pixels) of the outline drawn around a region.
const REGION_BORDER_COLOR = [0, 187, 255, 255] as const;
const REGION_BORDER_THICKNESS = 2;
// Regions narrower or shorter than this are left without an outline.
const MIN_ANNOTATED_REGION_SIDE = 4;

/**
 * Outlines each sufficiently large region on the diff image, in place.
 *
 * @param diff Image whose pixel data is overwritten along the outlines.
 * @param regions Regions to outline; those with a side below
 *   MIN_ANNOTATED_REGION_SIDE are skipped.
 */
export function annotateDiffRegions(diff: PNG, regions: ScreenshotDiffRegion[]): void {
  regions
    .filter(
      ({ rect }) =>
        !(rect.width < MIN_ANNOTATED_REGION_SIDE || rect.height < MIN_ANNOTATED_REGION_SIDE),
    )
    .forEach(({ rect }) => drawRect(diff, rect));
}
/**
 * Draws a rectangular outline, REGION_BORDER_THICKNESS pixels thick, growing
 * inward from the rectangle's edges. Corners are clamped to the image bounds.
 */
function drawRect(diff: PNG, rect: ScreenshotDiffRegion['rect']): void {
  const lastColumn = diff.width - 1;
  const lastRow = diff.height - 1;
  const left = clamp(rect.x, 0, lastColumn);
  const top = clamp(rect.y, 0, lastRow);
  const right = clamp(rect.x + rect.width - 1, 0, lastColumn);
  const bottom = clamp(rect.y + rect.height - 1, 0, lastRow);
  for (let inset = 0; inset < REGION_BORDER_THICKNESS; inset += 1) {
    // Top and bottom edges for this inset.
    for (let x = left; x <= right; x += 1) {
      setPixel(diff, x, top + inset, REGION_BORDER_COLOR);
      setPixel(diff, x, bottom - inset, REGION_BORDER_COLOR);
    }
    // Left and right edges for this inset.
    for (let y = top; y <= bottom; y += 1) {
      setPixel(diff, left + inset, y, REGION_BORDER_COLOR);
      setPixel(diff, right - inset, y, REGION_BORDER_COLOR);
    }
  }
}

/**
 * Writes one RGBA pixel into the image buffer; coordinates outside the image
 * are ignored.
 */
function setPixel(
  diff: PNG,
  x: number,
  y: number,
  color: readonly [number, number, number, number],
): void {
  const insideImage = x >= 0 && x < diff.width && y >= 0 && y < diff.height;
  if (!insideImage) return;
  const base = (y * diff.width + x) * 4;
  color.forEach((channel, offset) => {
    diff.data[base + offset] = channel;
  });
}

/** Raises `value` to at least `min`, then caps the result at `max`. */
function clamp(value: number, min: number, max: number): number {
  const raised = Math.max(value, min);
  return Math.min(raised, max);
}
// Split thresholds are expressed relative to the screenshot size so the same
// rules apply to phone, tablet, and desktop captures; the pixel floors only
// keep tiny fixtures and noise from being split.
const MIN_SPLIT_REGION_HEIGHT_RATIO = 0.07;
const MIN_SPLIT_REGION_HEIGHT_FLOOR_PX = 48;
const MIN_SPLIT_REGION_WIDTH_RATIO = 0.35;
const MIN_SPLIT_SEGMENT_HEIGHT_RATIO = 0.03;
const MIN_SPLIT_SEGMENT_HEIGHT_FLOOR_PX = 24;
const LOW_DENSITY_RATIO = 0.08;
const MIN_LOW_DENSITY_BAND_HEIGHT = 6;
const ROW_SMOOTHING_RADIUS = 3;

/**
 * Breaks tall, wide diff regions into horizontal slices along rows where few
 * pixels changed. Regions that are not split are passed through unchanged.
 *
 * @param regions Candidate regions.
 * @param params Diff mask plus the two images; the baseline supplies the
 *   image width and height used for the thresholds.
 * @returns The regions, with each split region replaced by its slices.
 */
export function splitLargeDiffRegions(
  regions: MutableDiffRegion[],
  params: { diffMask: Uint8Array; baseline: PNG; current: PNG },
): MutableDiffRegion[] {
  const result: MutableDiffRegion[] = [];
  for (const region of regions) {
    if (!shouldSplitRegion(region, params.baseline.width, params.baseline.height)) {
      result.push(region);
      continue;
    }
    const slices = splitRegionByHorizontalDensity(
      region,
      params,
      minSplitSegmentHeight(params.baseline.height),
    );
    result.push(...slices);
  }
  return result;
}

/**
 * A region is a split candidate when it is at least the minimum split height
 * and spans at least MIN_SPLIT_REGION_WIDTH_RATIO of the image width.
 */
function shouldSplitRegion(
  region: MutableDiffRegion,
  imageWidth: number,
  imageHeight: number,
): boolean {
  const regionHeight = region.maxY - region.minY + 1;
  if (!(regionHeight >= minSplitRegionHeight(imageHeight))) return false;
  const regionWidth = region.maxX - region.minX + 1;
  return regionWidth >= imageWidth * MIN_SPLIT_REGION_WIDTH_RATIO;
}

/**
 * Cuts a region at the middle of its low-density row bands.
 *
 * @returns The slices when at least two non-empty ones result; otherwise the
 *   original region as a single-element array.
 */
function splitRegionByHorizontalDensity(
  region: MutableDiffRegion,
  params: { diffMask: Uint8Array; baseline: PNG; current: PNG },
  minSegmentHeight: number,
): MutableDiffRegion[] {
  const regionWidth = region.maxX - region.minX + 1;
  // A row is "quiet" when its smoothed count is at or below this many pixels.
  const quietRowThreshold = Math.max(1, Math.round(regionWidth * LOW_DENSITY_RATIO));
  const smoothedRowCounts = smoothCounts(
    measureRowDiffCounts(region, params.diffMask, params.baseline.width),
  );
  const quietBands = findLowDensityBands(smoothedRowCounts, quietRowThreshold);
  const ranges = buildSegmentRanges(region, quietBands, minSegmentHeight);
  if (ranges.length <= 1) return [region];

  const slices: MutableDiffRegion[] = [];
  for (const [top, bottom] of ranges) {
    const slice = buildRegionSlice(region, top, bottom, params);
    if (slice !== null) slices.push(slice);
  }
  return slices.length > 1 ? slices : [region];
}
/**
 * Counts, for every row of the region's bounding box, how many mask pixels
 * inside the box are set to 1.
 *
 * @param imageWidth Row stride of `diffMask`.
 * @returns One count per row, top to bottom.
 */
function measureRowDiffCounts(
  region: MutableDiffRegion,
  diffMask: Uint8Array,
  imageWidth: number,
): number[] {
  const counts: number[] = [];
  for (let y = region.minY; y <= region.maxY; y += 1) {
    const rowStart = y * imageWidth;
    let changed = 0;
    for (let offset = rowStart + region.minX; offset <= rowStart + region.maxX; offset += 1) {
      if (diffMask[offset] === 1) changed += 1;
    }
    counts.push(changed);
  }
  return counts;
}

/**
 * Replaces each count with the rounded mean of the counts within
 * ROW_SMOOTHING_RADIUS rows of it; the window is truncated at both ends.
 */
function smoothCounts(counts: number[]): number[] {
  return counts.map((_, index) => {
    const from = Math.max(0, index - ROW_SMOOTHING_RADIUS);
    const to = Math.min(counts.length - 1, index + ROW_SMOOTHING_RADIUS);
    const window = counts.slice(from, to + 1);
    const total = window.reduce((sum, value) => sum + value, 0);
    return Math.round(total / window.length);
  });
}

/**
 * Finds runs of consecutive rows whose count is at or below `threshold`.
 *
 * @returns Inclusive [start, end] row indices of every run that is at least
 *   MIN_LOW_DENSITY_BAND_HEIGHT rows long, in ascending order.
 */
function findLowDensityBands(counts: number[], threshold: number): Array<[number, number]> {
  const bands: Array<[number, number]> = [];
  let openStart: number | null = null;
  // Ends the run in progress (if any) just before `endExclusive`.
  const closeBand = (endExclusive: number): void => {
    if (openStart !== null && endExclusive - openStart >= MIN_LOW_DENSITY_BAND_HEIGHT) {
      bands.push([openStart, endExclusive - 1]);
    }
    openStart = null;
  };
  counts.forEach((count, index) => {
    if (count <= threshold) {
      if (openStart === null) openStart = index;
    } else {
      closeBand(index);
    }
  });
  closeBand(counts.length);
  return bands;
}
/**
 * Turns low-density bands into inclusive [minY, maxY] row ranges by cutting at
 * the middle row of each band.
 *
 * A cut is skipped when the segment above it, or the remainder below it, would
 * be shorter than `minSegmentHeight`.
 *
 * @param lowDensityBands Inclusive band bounds, relative to `region.minY`.
 * @returns Consecutive ranges covering `region.minY` through `region.maxY`.
 */
function buildSegmentRanges(
  region: MutableDiffRegion,
  lowDensityBands: Array<[number, number]>,
  minSegmentHeight: number,
): Array<[number, number]> {
  const ranges: Array<[number, number]> = [];
  let nextStart = region.minY;
  for (const [bandStart, bandEnd] of lowDensityBands) {
    const cutY = region.minY + Math.round((bandStart + bandEnd) / 2);
    const heightAbove = cutY - nextStart + 1;
    const heightBelow = region.maxY - cutY;
    const tooThin = heightAbove < minSegmentHeight || heightBelow < minSegmentHeight;
    if (!tooThin) {
      ranges.push([nextStart, cutY]);
      nextStart = cutY + 1;
    }
  }
  ranges.push([nextStart, region.maxY]);
  return ranges;
}

/** Minimum region height eligible for splitting: ratio of image height, floored in pixels. */
function minSplitRegionHeight(imageHeight: number): number {
  const relative = Math.round(imageHeight * MIN_SPLIT_REGION_HEIGHT_RATIO);
  return Math.max(MIN_SPLIT_REGION_HEIGHT_FLOOR_PX, relative);
}

/** Minimum height of a split segment: ratio of image height, floored in pixels. */
function minSplitSegmentHeight(imageHeight: number): number {
  const relative = Math.round(imageHeight * MIN_SPLIT_SEGMENT_HEIGHT_RATIO);
  return Math.max(MIN_SPLIT_SEGMENT_HEIGHT_FLOOR_PX, relative);
}

/**
 * Re-accumulates a region from the mask pixels set inside its bounding-box
 * columns for rows `minY` through `maxY`.
 *
 * @returns The rebuilt slice, or null when no mask pixel is set in that band.
 */
function buildRegionSlice(
  region: MutableDiffRegion,
  minY: number,
  maxY: number,
  params: { diffMask: Uint8Array; baseline: PNG; current: PNG },
): MutableDiffRegion | null {
  let slice: MutableDiffRegion | null = null;
  for (let y = minY; y <= maxY; y += 1) {
    const rowStart = y * params.baseline.width;
    for (let x = region.minX; x <= region.maxX; x += 1) {
      const pixelIndex = rowStart + x;
      if (params.diffMask[pixelIndex] !== 1) continue;
      if (slice === null) slice = createEmptyRegion(x, y);
      addPixelToSlice(slice, pixelIndex, x, y, params.baseline, params.current);
    }
  }
  return slice;
}

/** New accumulator whose bounds are the single pixel (x, y) and whose counters are zero. */
function createEmptyRegion(x: number, y: number): MutableDiffRegion {
  const bounds = { minX: x, minY: y, maxX: x, maxY: y };
  const totals = {
    differentPixels: 0,
    baselineRed: 0,
    baselineGreen: 0,
    baselineBlue: 0,
    currentRed: 0,
    currentGreen: 0,
    currentBlue: 0,
  };
  return { ...bounds, ...totals };
}
/**
 * Folds one differing pixel into a slice: widens the bounds to include (x, y),
 * bumps the pixel count, and adds the pixel's RGB values from both images to
 * the running channel sums.
 *
 * @param pixelIndex Index of the pixel in the image (RGBA offset is 4x this).
 */
function addPixelToSlice(
  slice: MutableDiffRegion,
  pixelIndex: number,
  x: number,
  y: number,
  baseline: PNG,
  current: PNG,
): void {
  const offset = pixelIndex * 4;
  const before = baseline.data;
  const after = current.data;

  slice.minX = Math.min(slice.minX, x);
  slice.maxX = Math.max(slice.maxX, x);
  slice.minY = Math.min(slice.minY, y);
  slice.maxY = Math.max(slice.maxY, y);
  slice.differentPixels += 1;

  slice.baselineRed += before[offset]!;
  slice.baselineGreen += before[offset + 1]!;
  slice.baselineBlue += before[offset + 2]!;
  slice.currentRed += after[offset]!;
  slice.currentGreen += after[offset + 1]!;
  slice.currentBlue += after[offset + 2]!;
}

/** An RGB color with one number per channel. */
type ScreenshotDiffColor = {
  r: number;
  g: number;
  b: number;
};

/** Summary of one changed area of a screenshot diff. */
export type ScreenshotDiffRegion = {
  /** 1-based rank of the region in the reported list. */
  index: number;
  /** Bounding box in image pixels. */
  rect: { x: number; y: number; width: number; height: number };
  /** Bounding box as percentages of the image width / height. */
  normalizedRect: { x: number; y: number; width: number; height: number };
  /** Number of differing pixels attributed to the region. */
  differentPixels: number;
  /** This region's differing pixels as a percentage of all differing pixels. */
  shareOfDiffPercentage: number;
  /** Differing pixels as a percentage of the bounding-box area. */
  densityPercentage: number;
  shape: 'compact' | 'horizontal-band' | 'vertical-band' | 'large-area';
  size: 'small' | 'medium' | 'large';
  /** Coarse position label such as 'top-left' or 'center'. */
  location: string;
  /** Mean color of the differing pixels in each image, as '#rrggbb'. */
  averageBaselineColorHex: string;
  averageCurrentColorHex: string;
  /** Rounded luminance of the mean colors above. */
  baselineLuminance: number;
  currentLuminance: number;
  dominantChange: 'brighter' | 'darker' | 'color-shift' | 'mixed';
  /** Overlay refs overlapping the region, when any were attached. */
  currentOverlayMatches?: ScreenshotDiffRegionOverlayMatch[];
};

/** An overlay ref that overlaps a diff region. */
export type ScreenshotDiffRegionOverlayMatch = {
  ref: string;
  label?: string;
  /** Percentage of the region's bounding box covered by the overlay rect. */
  regionCoveragePercentage: number;
  rect: { x: number; y: number; width: number; height: number };
};
// Number of regions reported when the caller does not pass `maxRegions`.
const DEFAULT_MAX_DIFF_REGIONS = 8;
// Bounding boxes at most this many pixels apart are merged into one region.
const REGION_MERGE_GAP_PX = 12;
// Above this many raw components the nearby-merge pass is skipped.
const MAX_REGIONS_TO_MERGE = 2000;
// The labels below are coarse, screen-relative buckets meant as guidance for
// agents; they are not tuned to a particular screenshot size or app layout.
const DOMINANT_CHANGE_MIN_CHANNEL_DELTA = 12;
const LARGE_AREA_MIN_WIDTH_RATIO = 0.55;
const LARGE_AREA_MIN_HEIGHT_RATIO = 0.12;
const BAND_MIN_ASPECT_RATIO = 2.5;
const LARGE_REGION_MIN_AREA_RATIO = 0.04;
const MEDIUM_REGION_MIN_AREA_RATIO = 0.01;

/**
 * Accumulator for a diff region while it is being built: inclusive pixel
 * bounds, the number of differing pixels, and per-channel sums of those
 * pixels' colors in the baseline and current images.
 */
export type MutableDiffRegion = {
  minX: number;
  minY: number;
  maxX: number;
  maxY: number;
  differentPixels: number;
  baselineRed: number;
  baselineGreen: number;
  baselineBlue: number;
  currentRed: number;
  currentGreen: number;
  currentBlue: number;
};

/**
 * Produces the ranked list of changed regions for a screenshot diff.
 *
 * Pipeline: find connected components in the mask, merge nearby ones, split
 * large ones, order by differing-pixel count (ties: topmost, then leftmost),
 * keep the first `maxRegions`, and convert each to its reported form.
 *
 * @param params.maxRegions Cap on reported regions; defaults to
 *   DEFAULT_MAX_DIFF_REGIONS and is treated as 0 when negative.
 * @returns Regions with 1-based `index` values in ranked order.
 */
export function summarizeDiffRegions(params: {
  diffMask: Uint8Array;
  baseline: PNG;
  current: PNG;
  totalPixels: number;
  differentPixels: number;
  maxRegions?: number;
}): ScreenshotDiffRegion[] {
  const connected = findConnectedDiffRegions(params);
  // The nearby-merge pass is quadratic, so it is skipped on extremely noisy
  // diffs; ranking still surfaces the largest components, though tiny speckles
  // may stay unmerged.
  const merged =
    connected.length > MAX_REGIONS_TO_MERGE
      ? connected
      : mergeNearbyRegions(connected, REGION_MERGE_GAP_PX);
  const candidates = splitLargeDiffRegions(merged, params);

  const byImpact = (left: MutableDiffRegion, right: MutableDiffRegion): number => {
    const pixelDelta = right.differentPixels - left.differentPixels;
    if (pixelDelta !== 0) return pixelDelta;
    const topDelta = left.minY - right.minY;
    if (topDelta !== 0) return topDelta;
    return left.minX - right.minX;
  };
  candidates.sort(byImpact);

  const limit = Math.max(0, params.maxRegions ?? DEFAULT_MAX_DIFF_REGIONS);
  const image = {
    width: params.baseline.width,
    height: params.baseline.height,
    totalPixels: params.totalPixels,
    differentPixels: params.differentPixels,
  };
  return candidates
    .slice(0, limit)
    .map((region, position) => toScreenshotDiffRegion(region, position + 1, image));
}
/**
 * Labels the connected components of the diff mask.
 *
 * Pixels whose mask value is 1 are grouped with their 8 neighbours using a
 * breadth-first flood fill over a preallocated index queue. Each component
 * accumulates its bounds, pixel count and color sums.
 *
 * @returns One region per component, ordered by each component's first pixel
 *   in mask (row-major) order.
 */
function findConnectedDiffRegions(params: {
  diffMask: Uint8Array;
  baseline: PNG;
  current: PNG;
}): MutableDiffRegion[] {
  const { diffMask, baseline, current } = params;
  const { width, height } = baseline;
  const seen = new Uint8Array(diffMask.length);
  const pending = new Int32Array(diffMask.length);
  const regions: MutableDiffRegion[] = [];

  for (let seed = 0; seed < diffMask.length; seed += 1) {
    if (diffMask[seed] !== 1 || seen[seed] === 1) continue;

    const seedX = seed % width;
    const seedY = Math.floor(seed / width);
    const region: MutableDiffRegion = {
      minX: seedX,
      minY: seedY,
      maxX: seedX,
      maxY: seedY,
      differentPixels: 0,
      baselineRed: 0,
      baselineGreen: 0,
      baselineBlue: 0,
      currentRed: 0,
      currentGreen: 0,
      currentBlue: 0,
    };

    // `pending[head..tail)` holds pixels that are marked but not yet expanded.
    let head = 0;
    let tail = 0;
    pending[tail] = seed;
    tail += 1;
    seen[seed] = 1;

    while (head < tail) {
      const pixel = pending[head]!;
      head += 1;
      addPixelToRegion(region, pixel, width, baseline, current);

      const x = pixel % width;
      const y = Math.floor(pixel / width);
      // Neighbourhood window clipped to the image, scanned row by row.
      const top = Math.max(0, y - 1);
      const bottom = Math.min(height - 1, y + 1);
      const left = Math.max(0, x - 1);
      const right = Math.min(width - 1, x + 1);
      for (let neighborY = top; neighborY <= bottom; neighborY += 1) {
        for (let neighborX = left; neighborX <= right; neighborX += 1) {
          if (neighborX === x && neighborY === y) continue;
          const neighbor = neighborY * width + neighborX;
          if (diffMask[neighbor] !== 1 || seen[neighbor] === 1) continue;
          // Mark on enqueue so a pixel is never queued twice.
          seen[neighbor] = 1;
          pending[tail] = neighbor;
          tail += 1;
        }
      }
    }

    regions.push(region);
  }

  return regions;
}
/**
 * Folds one differing pixel into a region: widens the bounds to include the
 * pixel, bumps the pixel count, and adds the pixel's RGB values from both
 * images to the running channel sums.
 *
 * @param pixelIndex Row-major pixel index; x and y are derived from `width`.
 * @param width Image width in pixels.
 */
function addPixelToRegion(
  region: MutableDiffRegion,
  pixelIndex: number,
  width: number,
  baseline: PNG,
  current: PNG,
): void {
  const x = pixelIndex % width;
  const y = Math.floor(pixelIndex / width);
  const dataIndex = pixelIndex * 4;
  region.minX = Math.min(region.minX, x);
  region.minY = Math.min(region.minY, y);
  region.maxX = Math.max(region.maxX, x);
  region.maxY = Math.max(region.maxY, y);
  region.differentPixels += 1;
  region.baselineRed += baseline.data[dataIndex]!;
  region.baselineGreen += baseline.data[dataIndex + 1]!;
  region.baselineBlue += baseline.data[dataIndex + 2]!;
  region.currentRed += current.data[dataIndex]!;
  region.currentGreen += current.data[dataIndex + 1]!;
  region.currentBlue += current.data[dataIndex + 2]!;
}

/**
 * Merges regions whose bounding boxes lie within `gapPx` of each other.
 *
 * Regions are visited top-to-bottom, then left-to-right, and each is folded
 * into the first already-kept region it is near. This is a single greedy
 * pass: kept regions that grow until they touch one another are not merged
 * again.
 *
 * @param regions Input regions. Neither the array nor its elements are
 *   modified; the sort runs on a copy and kept regions are shallow copies.
 * @param gapPx Maximum gap, in pixels, between boxes that still merge.
 * @returns The merged regions.
 */
function mergeNearbyRegions(regions: MutableDiffRegion[], gapPx: number): MutableDiffRegion[] {
  const readingOrder = [...regions].sort((left, right) => {
    const topDelta = left.minY - right.minY;
    if (topDelta !== 0) return topDelta;
    return left.minX - right.minX;
  });
  const merged: MutableDiffRegion[] = [];
  for (const region of readingOrder) {
    const existing = merged.find((candidate) => regionsAreNear(candidate, region, gapPx));
    if (!existing) {
      merged.push({ ...region });
      continue;
    }
    mergeRegionInto(existing, region);
  }
  return merged;
}

/** True when the two bounding boxes, each grown by `gapPx`, overlap or touch. */
function regionsAreNear(left: MutableDiffRegion, right: MutableDiffRegion, gapPx: number): boolean {
  return (
    left.minX - gapPx <= right.maxX &&
    right.minX - gapPx <= left.maxX &&
    left.minY - gapPx <= right.maxY &&
    right.minY - gapPx <= left.maxY
  );
}

/** Grows `target` to cover `source` and adds source's counts and color sums to it. */
function mergeRegionInto(target: MutableDiffRegion, source: MutableDiffRegion): void {
  target.minX = Math.min(target.minX, source.minX);
  target.minY = Math.min(target.minY, source.minY);
  target.maxX = Math.max(target.maxX, source.maxX);
  target.maxY = Math.max(target.maxY, source.maxY);
  target.differentPixels += source.differentPixels;
  target.baselineRed += source.baselineRed;
  target.baselineGreen += source.baselineGreen;
  target.baselineBlue += source.baselineBlue;
  target.currentRed += source.currentRed;
  target.currentGreen += source.currentGreen;
  target.currentBlue += source.currentBlue;
}
/**
 * Converts an accumulated region into its reported form.
 *
 * Derives the pixel and percentage bounding boxes, the mean colors and their
 * rounded luminance, the density within the bounding box, this region's share
 * of all differing pixels, and the coarse shape / size / location / change
 * labels.
 *
 * @param index 1-based rank to store on the result.
 * @param image Image dimensions plus the total and differing pixel counts.
 */
function toScreenshotDiffRegion(
  region: MutableDiffRegion,
  index: number,
  image: { width: number; height: number; totalPixels: number; differentPixels: number },
): ScreenshotDiffRegion {
  const { minX, minY, maxX, maxY, differentPixels } = region;
  const rect = { x: minX, y: minY, width: maxX - minX + 1, height: maxY - minY + 1 };
  const boxArea = rect.width * rect.height;
  const center = {
    x: Math.round(minX + rect.width / 2),
    y: Math.round(minY + rect.height / 2),
  };

  const baselineColor = averageRegionColor(
    region.baselineRed,
    region.baselineGreen,
    region.baselineBlue,
    differentPixels,
  );
  const currentColor = averageRegionColor(
    region.currentRed,
    region.currentGreen,
    region.currentBlue,
    differentPixels,
  );

  // Percentages of the image's own width (x, width) and height (y, height).
  const normalizedRect = {
    x: roundPercentage(rect.x / image.width),
    y: roundPercentage(rect.y / image.height),
    width: roundPercentage(rect.width / image.width),
    height: roundPercentage(rect.height / image.height),
  };

  return {
    index,
    rect,
    normalizedRect,
    differentPixels,
    shareOfDiffPercentage: roundPercentage(differentPixels / image.differentPixels),
    densityPercentage: roundPercentage(differentPixels / boxArea),
    shape: describeRegionShape(rect, image.width, image.height),
    size: describeRegionSize(boxArea, image.totalPixels),
    location: describeRegionLocation(center, image.width, image.height),
    averageBaselineColorHex: toHexColor(baselineColor),
    averageCurrentColorHex: toHexColor(currentColor),
    baselineLuminance: Math.round(luminance(baselineColor)),
    currentLuminance: Math.round(luminance(currentColor)),
    dominantChange: describeDominantChange(baselineColor, currentColor),
  };
}
/** Mean color from per-channel sums over `pixels` pixels, each channel rounded. */
function averageRegionColor(
  red: number,
  green: number,
  blue: number,
  pixels: number,
): ScreenshotDiffColor {
  const mean = (channelSum: number): number => Math.round(channelSum / pixels);
  return { r: mean(red), g: mean(green), b: mean(blue) };
}

/**
 * Names the third of the image the point falls into on each axis.
 *
 * @returns 'center' for the middle cell, otherwise `<vertical>-<horizontal>`
 *   such as 'top-left' or 'middle-right'.
 */
function describeRegionLocation(
  center: { x: number; y: number },
  width: number,
  height: number,
): string {
  let horizontal = 'center';
  if (center.x < width / 3) horizontal = 'left';
  else if (center.x > (width * 2) / 3) horizontal = 'right';

  let vertical = 'middle';
  if (center.y < height / 3) vertical = 'top';
  else if (center.y > (height * 2) / 3) vertical = 'bottom';

  if (horizontal === 'center' && vertical === 'middle') return 'center';
  return `${vertical}-${horizontal}`;
}

/**
 * Classifies how the mean color changed between baseline and current.
 *
 * A luminance change of at least DOMINANT_CHANGE_MIN_CHANNEL_DELTA wins and is
 * reported as 'brighter' or 'darker'. Otherwise the largest single-channel
 * change decides between 'color-shift' and 'mixed' using the same threshold.
 */
function describeDominantChange(
  baseline: ScreenshotDiffColor,
  current: ScreenshotDiffColor,
): ScreenshotDiffRegion['dominantChange'] {
  const luminanceDelta = luminance(current) - luminance(baseline);
  if (Math.abs(luminanceDelta) >= DOMINANT_CHANGE_MIN_CHANNEL_DELTA) {
    return luminanceDelta > 0 ? 'brighter' : 'darker';
  }

  const channelDeltas = [
    Math.abs(current.r - baseline.r),
    Math.abs(current.g - baseline.g),
    Math.abs(current.b - baseline.b),
  ];
  const largestChannelDelta = Math.max(...channelDeltas);
  if (largestChannelDelta >= DOMINANT_CHANGE_MIN_CHANNEL_DELTA) return 'color-shift';
  return 'mixed';
}
/**
 * Coarse shape label for a region's bounding box.
 *
 * 'large-area' when the box spans at least LARGE_AREA_MIN_WIDTH_RATIO of the
 * image width and LARGE_AREA_MIN_HEIGHT_RATIO of its height; otherwise a band
 * label when one side is at least BAND_MIN_ASPECT_RATIO times the other;
 * otherwise 'compact'.
 */
function describeRegionShape(
  rect: { width: number; height: number },
  imageWidth: number,
  imageHeight: number,
): ScreenshotDiffRegion['shape'] {
  if (
    rect.width >= imageWidth * LARGE_AREA_MIN_WIDTH_RATIO &&
    rect.height >= imageHeight * LARGE_AREA_MIN_HEIGHT_RATIO
  ) {
    return 'large-area';
  }
  if (rect.width >= rect.height * BAND_MIN_ASPECT_RATIO) return 'horizontal-band';
  if (rect.height >= rect.width * BAND_MIN_ASPECT_RATIO) return 'vertical-band';
  return 'compact';
}

/** Size bucket from the bounding-box area as a fraction of the whole image. */
function describeRegionSize(regionArea: number, totalPixels: number): ScreenshotDiffRegion['size'] {
  const areaRatio = regionArea / totalPixels;
  if (areaRatio >= LARGE_REGION_MIN_AREA_RATIO) return 'large';
  if (areaRatio >= MEDIUM_REGION_MIN_AREA_RATIO) return 'medium';
  return 'small';
}

/** Weighted channel sum: 0.2126 R + 0.7152 G + 0.0722 B. */
function luminance(color: ScreenshotDiffColor): number {
  return color.r * 0.2126 + color.g * 0.7152 + color.b * 0.0722;
}

/** Formats a color as a lowercase '#rrggbb' string. */
function toHexColor(color: ScreenshotDiffColor): string {
  return `#${toHexChannel(color.r)}${toHexChannel(color.g)}${toHexChannel(color.b)}`;
}

/**
 * Formats one channel as exactly two lowercase hex digits.
 *
 * The value is rounded and limited to 0..255 first, and a non-finite value is
 * written as 00, so the result is always a valid byte. Integer inputs already
 * in range are formatted exactly as before.
 */
function toHexChannel(value: number): string {
  const byte = Number.isFinite(value) ? Math.min(255, Math.max(0, Math.round(value))) : 0;
  return byte.toString(16).padStart(2, '0');
}

/** Converts a ratio to a percentage rounded to two decimals. */
function roundPercentage(ratio: number): number {
  return Math.round(ratio * 100 * 100) / 100;
}
'./screenshot-diff-regions.ts'; export type ScreenshotDimensionMismatch = { expected: { width: number; height: number }; @@ -16,11 +23,17 @@ export type ScreenshotDiffResult = { mismatchPercentage: number; match: boolean; dimensionMismatch?: ScreenshotDimensionMismatch; + regions?: ScreenshotDiffRegion[]; + currentOverlayPath?: string; + currentOverlayRefCount?: number; + ocr?: ScreenshotOcrSummary; + nonTextDeltas?: ScreenshotNonTextDelta[]; }; export type ScreenshotDiffOptions = { threshold?: number; outputPath?: string; + maxRegions?: number; }; // Each pixel is a point in 3D RGB space (R, G, B each 0–255). @@ -29,6 +42,9 @@ export type ScreenshotDiffOptions = { // We use this as the denominator so threshold 0–1 maps linearly to the full // color distance range: 0 = exact match only, 1 = everything matches. const COLOR_DISTANCE_SCALE = 255 * Math.sqrt(3); +const DIFF_CONTEXT_LIGHTEN_RATIO = 0.72; +const DIFF_CHANGE_TINT_RATIO = 0.78; +const DIFF_CHANGE_COLOR = { r: 220, g: 0, b: 0 } as const; export async function compareScreenshots( baselinePath: string, @@ -69,12 +85,13 @@ export async function compareScreenshots( const totalPixels = baseline.width * baseline.height; const maxColorDistance = threshold * COLOR_DISTANCE_SCALE; const diff = new PNG({ width: baseline.width, height: baseline.height }); + const diffMask = new Uint8Array(totalPixels); let differentPixels = 0; // PNG data is a flat RGBA buffer: [R, G, B, A, R, G, B, A, ...]. // We step by 4 to visit each pixel and compute its Euclidean distance // in RGB space between the baseline and current image. - for (let index = 0; index < baseline.data.length; index += 4) { + for (let index = 0, pixelIndex = 0; index < baseline.data.length; index += 4, pixelIndex += 1) { const redDelta = baseline.data[index]! - current.data[index]!; const greenDelta = baseline.data[index + 1]! - current.data[index + 1]!; const blueDelta = baseline.data[index + 2]! 
- current.data[index + 2]!; @@ -82,34 +99,74 @@ export async function compareScreenshots( if (colorDistance > maxColorDistance) { differentPixels += 1; - // Red highlight for different pixels - diff.data[index] = 255; - diff.data[index + 1] = 0; - diff.data[index + 2] = 0; + diffMask[pixelIndex] = 1; + const context = renderDiffContextChannel(current, index); + diff.data[index] = tintChannel(context, DIFF_CHANGE_COLOR.r, DIFF_CHANGE_TINT_RATIO); + diff.data[index + 1] = tintChannel(context, DIFF_CHANGE_COLOR.g, DIFF_CHANGE_TINT_RATIO); + diff.data[index + 2] = tintChannel(context, DIFF_CHANGE_COLOR.b, DIFF_CHANGE_TINT_RATIO); diff.data[index + 3] = 255; continue; } - // Unchanged pixels are converted to a dimmed grayscale (30% brightness). - // This makes the diff image look like a faded version of the original with - // red pixels popping out where differences exist. - const gray = Math.round( - (baseline.data[index]! + baseline.data[index + 1]! + baseline.data[index + 2]!) / 3, - ); - const dimmed = Math.round(gray * 0.3); - diff.data[index] = dimmed; - diff.data[index + 1] = dimmed; - diff.data[index + 2] = dimmed; + const context = renderDiffContextChannel(current, index); + diff.data[index] = context; + diff.data[index + 1] = context; + diff.data[index + 2] = context; diff.data[index + 3] = 255; } + const regions = + differentPixels > 0 + ? summarizeDiffRegions({ + diffMask, + baseline, + current, + totalPixels, + differentPixels, + maxRegions: options.maxRegions, + }) + : []; + if (differentPixels > 0 && diffOutputPath) { + annotateDiffRegions(diff, regions); await fs.mkdir(path.dirname(diffOutputPath), { recursive: true }); await fs.writeFile(diffOutputPath, PNG.sync.write(diff)); } else { await removeStaleDiffOutput(options.outputPath); } + const ocrAnalysis = + differentPixels > 0 + ? 
await summarizeScreenshotOcr({ + baselinePath, + currentPath, + width: baseline.width, + height: baseline.height, + }) + : undefined; + const shouldIncludeOcr = + ocrAnalysis && + (ocrAnalysis.matches.length > 0 || (ocrAnalysis.movementClusters?.length ?? 0) > 0); + const ocr = shouldIncludeOcr + ? { + provider: ocrAnalysis.provider, + baselineBlocks: ocrAnalysis.baselineBlocks, + currentBlocks: ocrAnalysis.currentBlocks, + matches: ocrAnalysis.matches, + ...(ocrAnalysis.movementClusters ? { movementClusters: ocrAnalysis.movementClusters } : {}), + } + : undefined; + const nonTextDeltas = + differentPixels > 0 && ocrAnalysis + ? summarizeNonTextDiffDeltas({ + diffMask, + width: baseline.width, + height: baseline.height, + regions, + ocr: ocrAnalysis, + }) + : []; + // Round to 2 decimal places: multiply percentage by 100 before rounding, // then divide back. e.g. 0.12345 → 12.345% → round(1234.5)/100 → 12.35% const mismatchPercentage = @@ -117,6 +174,9 @@ export async function compareScreenshots( return { ...(differentPixels > 0 && diffOutputPath ? { diffPath: diffOutputPath } : {}), + ...(regions.length > 0 ? { regions } : {}), + ...(ocr ? { ocr } : {}), + ...(nonTextDeltas.length > 0 ? { nonTextDeltas } : {}), totalPixels, differentPixels, mismatchPercentage, @@ -144,3 +204,14 @@ async function removeStaleDiffOutput(outputPath: string | undefined): Promise `. - Burned-in touch overlays are exported only on macOS hosts, because the overlay pipeline depends on Swift + AVFoundation helpers. - On Linux or other non-macOS hosts, `record stop` still succeeds and returns the raw video plus telemetry sidecar, and includes `overlayWarning` when burn-in overlays were skipped.