diff --git a/src/browser/html-tree.test.ts b/src/browser/html-tree.test.ts new file mode 100644 index 000000000..7d858d37a --- /dev/null +++ b/src/browser/html-tree.test.ts @@ -0,0 +1,108 @@ +import { describe, expect, it } from 'vitest'; +import { buildHtmlTreeJs, type HtmlTreeResult } from './html-tree.js'; + +/** + * The serializer runs in a page context via `page.evaluate`. In unit tests we + * substitute `document` with a minimal stub that mirrors the DOM surface used + * by the expression, then Function-eval the returned JS. + */ +function runTreeJs(root: unknown, selectorMatches: unknown[], selector: string | null): HtmlTreeResult { + const js = buildHtmlTreeJs({ selector }); + const fakeDocument = { + querySelectorAll: () => selectorMatches, + documentElement: root, + }; + const fn = new Function('document', `return ${js};`); + return fn(fakeDocument) as HtmlTreeResult; +} + +function runTreeJsInvalid(selector: string, errorMessage: string): unknown { + const js = buildHtmlTreeJs({ selector }); + const fakeDocument = { + querySelectorAll: () => { const e = new Error(errorMessage); e.name = 'SyntaxError'; throw e; }, + documentElement: null, + }; + const fn = new Function('document', `return ${js};`); + return fn(fakeDocument); +} + +function el(tag: string, attrs: Record, children: Array): FakeEl { + return { + nodeType: 1, + tagName: tag.toUpperCase(), + attributes: Object.entries(attrs).map(([name, value]) => ({ name, value })), + childNodes: children, + }; +} + +function txt(value: string): FakeText { return { nodeType: 3, nodeValue: value }; } + +type FakeEl = { nodeType: 1; tagName: string; attributes: Array<{ name: string; value: string }>; childNodes: Array }; +type FakeText = { nodeType: 3; nodeValue: string }; +type ChildOf = FakeEl | FakeText; + +describe('buildHtmlTreeJs', () => { + it('serializes a simple element into {tag, attrs, text, children}', () => { + const root = el('div', { class: 'hero', id: 'x' }, [txt('Hello')]); + const result = runTreeJs(root, [root], null); + expect(result.selector).toBeNull(); + expect(result.matched).toBe(1); + expect(result.tree).toEqual({ + tag: 'div', + attrs: { class: 'hero', id: 'x' }, + text: 'Hello', + children: [], + }); + }); + + it('collapses whitespace in direct text content only', () => { + const root = el('p', {}, [ + txt(' line \n one '), + el('span', {}, [txt('inner text')]), + txt('\tline two\t'), + ]); + const result = runTreeJs(root, [root], null); + expect(result.tree?.text).toBe('line one line two'); + expect(result.tree?.children[0].text).toBe('inner text'); + }); + + it('recurses into element children and preserves their attrs', () => { + const root = el('ul', { role: 'list' }, [ + el('li', { 'data-id': '1' }, [txt('first')]), + el('li', { 'data-id': '2' }, [txt('second')]), + ]); + const result = runTreeJs(root, [root], null); + expect(result.tree?.children).toHaveLength(2); + expect(result.tree?.children[0]).toEqual({ + tag: 'li', + attrs: { 'data-id': '1' }, + text: 'first', + children: [], + }); + }); + + it('returns matched=N and serializes only the first match', () => { + const first = el('article', { id: 'a' }, [txt('first')]); + const second = el('article', { id: 'b' }, [txt('second')]); + const result = runTreeJs(null, [first, second], 'article'); + expect(result.matched).toBe(2); + expect(result.tree?.attrs.id).toBe('a'); + }); + + it('returns tree=null and matched=0 when selector matches nothing', () => { + const result = runTreeJs(null, [], '.nothing'); + expect(result.matched).toBe(0); + expect(result.tree).toBeNull(); + }); + + it('catches SyntaxError from querySelectorAll and returns {invalidSelector:true, reason}', () => { + const result = runTreeJsInvalid('##$@@', "'##$@@' is not a valid selector") as { + selector: string; + invalidSelector: boolean; + reason: string; + }; + expect(result.invalidSelector).toBe(true); + expect(result.selector).toBe('##$@@'); + expect(result.reason).toContain('not a valid selector'); + }); +}); diff --git a/src/browser/html-tree.ts b/src/browser/html-tree.ts new file mode 100644 index 000000000..0550e0aee --- /dev/null +++ b/src/browser/html-tree.ts @@ -0,0 +1,78 @@ +/** + * Client-side HTML → structured tree serializer. + * + * Returned as a JS string that gets passed to `page.evaluate`. The expression + * walks the DOM subtree rooted at the first selector match (or documentElement + * when no selector is given) and emits a compact `{tag, attrs, text, children}` + * tree for agents to consume instead of re-parsing raw HTML. + * + * Text handling: `text` is the concatenated text of direct text children only, + * whitespace-collapsed. Nested element text is left inside `children[].text`. + * Ordering between text and elements is not preserved — agents that need it + * should fall back to raw HTML mode. + */ + +export interface BuildHtmlTreeJsOptions { + /** CSS selector to scope the tree; unscoped = documentElement */ + selector?: string | null; +} + +/** + * Returns a JS expression string. When evaluated in a page context the + * expression resolves to either + * `{selector, matched: number, tree: HtmlNode | null}` on success, or + * `{selector, invalidSelector: true, reason}` when `querySelectorAll` + * throws a `SyntaxError` for an unparseable selector. + * + * Callers must branch on `invalidSelector` to convert it into the CLI's + * `invalid_selector` structured error; otherwise the browser-level exception + * would bubble out of `page.evaluate` and bypass the structured-error + * contract that agents rely on. + */ +export function buildHtmlTreeJs(opts: BuildHtmlTreeJsOptions = {}): string { + const selectorLiteral = opts.selector ? JSON.stringify(opts.selector) : 'null'; + return `(() => { + const selector = ${selectorLiteral}; + let matches; + if (selector) { + try { matches = document.querySelectorAll(selector); } + catch (e) { + return { selector: selector, invalidSelector: true, reason: (e && e.message) || String(e) }; + } + } else { + matches = [document.documentElement]; + } + const matched = matches.length; + const root = matches[0] || null; + function serialize(el) { + if (!el || el.nodeType !== 1) return null; + const attrs = {}; + for (const a of el.attributes) attrs[a.name] = a.value; + let text = ''; + const children = []; + for (const n of el.childNodes) { + if (n.nodeType === 3) { + text += n.nodeValue; + } else if (n.nodeType === 1) { + const child = serialize(n); + if (child) children.push(child); + } + } + return { tag: el.tagName.toLowerCase(), attrs, text: text.replace(/\\s+/g, ' ').trim(), children }; + } + return { selector: selector, matched: matched, tree: root ? serialize(root) : null }; +})()`; +} + +export interface HtmlNode { + tag: string; + attrs: Record; + text: string; + children: HtmlNode[]; +} + +export interface HtmlTreeResult { + selector: string | null; + matched: number; + tree: HtmlNode | null; +} diff --git a/src/cli.test.ts b/src/cli.test.ts index eb188c653..cd552aa92 100644 --- a/src/cli.test.ts +++ b/src/cli.test.ts @@ -442,6 +442,166 @@ describe('browser network command', () => { }); }); +describe('browser get html command', () => { + const consoleLogSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + function lastLogArg(): unknown { + const calls = consoleLogSpy.mock.calls; + if (calls.length === 0) throw new Error('expected console.log call'); + return calls[calls.length - 1][0]; + } + function lastJsonLog(): any { + const arg = lastLogArg(); + if (typeof arg !== 'string') throw new Error(`expected string arg, got ${typeof arg}`); + return JSON.parse(arg); + } + + beforeEach(() => { + process.exitCode = undefined; + process.env.OPENCLI_CACHE_DIR = fs.mkdtempSync(path.join(os.tmpdir(), 'opencli-html-')); + consoleLogSpy.mockClear(); + mockBrowserConnect.mockClear(); + mockBrowserClose.mockReset().mockResolvedValue(undefined); + + browserState.page = { + setActivePage: vi.fn(), + getActivePage: vi.fn().mockReturnValue('tab-1'), + tabs: vi.fn().mockResolvedValue([{ page: 'tab-1', active: true }]), + evaluate: vi.fn(), + } as unknown as IPage; + }); + + it('returns full outerHTML by default with no truncation', async () => { + const big = '
' + 'x'.repeat(100_000) + '
'; + (browserState.page!.evaluate as any).mockResolvedValueOnce({ kind: 'ok', html: big }); + const program = createProgram('', ''); + + await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html']); + + expect(lastLogArg()).toBe(big); + }); + + it('caps output with --max and prepends a visible truncation marker', async () => { + const big = '
' + 'x'.repeat(500) + '
'; + (browserState.page!.evaluate as any).mockResolvedValueOnce({ kind: 'ok', html: big }); + const program = createProgram('', ''); + + await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--max', '100']); + + const out = String(lastLogArg()); + expect(out.startsWith('\n${html.slice(0, max)}`); + return; + } + console.log(html); })); addBrowserTabOption(get.command('attributes').argument('', 'Element index').description('Element attributes'))