Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions src/browser/html-tree.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import { describe, expect, it } from 'vitest';
import { buildHtmlTreeJs, type HtmlTreeResult } from './html-tree.js';

/**
 * Evaluates the serializer expression outside a browser.
 *
 * In production the expression is handed to `page.evaluate`; here it is
 * compiled with `new Function` and given a stub `document` exposing only
 * `querySelectorAll` and `documentElement`.
 *
 * @param root value exposed as `document.documentElement`
 * @param selectorMatches value returned by every `querySelectorAll` call
 * @param selector selector forwarded to `buildHtmlTreeJs` (`null` = unscoped)
 * @returns whatever the expression evaluates to, typed as `HtmlTreeResult`
 */
function runTreeJs(root: unknown, selectorMatches: unknown[], selector: string | null): HtmlTreeResult {
  const stubDocument = {
    querySelectorAll() {
      return selectorMatches;
    },
    documentElement: root,
  };
  const expression = buildHtmlTreeJs({ selector });
  // `document` is the function's sole parameter, so the expression resolves it to the stub.
  const evaluate = new Function('document', 'return ' + expression + ';');
  return evaluate(stubDocument) as HtmlTreeResult;
}

/**
 * Evaluates the serializer expression against a stub `document` whose
 * `querySelectorAll` always throws an `Error` named `SyntaxError`.
 *
 * @param selector selector forwarded to `buildHtmlTreeJs`
 * @param errorMessage message carried by the thrown error
 * @returns the raw value the expression evaluates to
 */
function runTreeJsInvalid(selector: string, errorMessage: string): unknown {
  const throwingDocument = {
    querySelectorAll(): never {
      const syntaxError = new Error(errorMessage);
      syntaxError.name = 'SyntaxError';
      throw syntaxError;
    },
    documentElement: null,
  };
  const expression = buildHtmlTreeJs({ selector });
  const evaluate = new Function('document', 'return ' + expression + ';');
  return evaluate(throwingDocument);
}

/** Stand-in for a DOM text node (`nodeType` 3). */
type FakeText = { nodeType: 3; nodeValue: string };

/** Stand-in for a DOM element (`nodeType` 1) with just the fields these fixtures populate. */
type FakeEl = {
  nodeType: 1;
  tagName: string;
  attributes: Array<{ name: string; value: string }>;
  childNodes: Array<ChildOf>;
};

/** Anything that may appear in a fake element's `childNodes`. */
type ChildOf = FakeEl | FakeText;

/**
 * Builds a fake element node.
 *
 * @param tag tag name in any case; stored upper-cased in `tagName`
 * @param attrs attribute map, converted to a `{name, value}` list in key order
 * @param children child nodes, stored as-is (same array reference)
 */
function el(tag: string, attrs: Record<string, string>, children: Array<ChildOf>): FakeEl {
  const attributes: FakeEl['attributes'] = [];
  for (const [name, value] of Object.entries(attrs)) {
    attributes.push({ name, value });
  }
  return { nodeType: 1, tagName: tag.toUpperCase(), attributes, childNodes: children };
}

/** Builds a fake text node holding `value`. */
function txt(value: string): FakeText {
  return { nodeType: 3, nodeValue: value };
}

// Unit tests for the page-side serializer, exercised through the stubbed
// `document` helpers (`runTreeJs` / `runTreeJsInvalid`) and the fake node builders.
describe('buildHtmlTreeJs', () => {
  it('serializes a simple element into {tag, attrs, text, children}', () => {
    const hero = el('div', { class: 'hero', id: 'x' }, [txt('Hello')]);

    const { selector, matched, tree } = runTreeJs(hero, [hero], null);

    expect(selector).toBeNull();
    expect(matched).toBe(1);
    expect(tree).toEqual({
      tag: 'div',
      attrs: { class: 'hero', id: 'x' },
      text: 'Hello',
      children: [],
    });
  });

  it('collapses whitespace in direct text content only', () => {
    // Two direct text nodes surround a <span>; the span's text must stay on the child.
    const leadingText = txt(' line \n one ');
    const nestedSpan = el('span', {}, [txt('inner text')]);
    const trailingText = txt('\tline two\t');
    const paragraph = el('p', {}, [leadingText, nestedSpan, trailingText]);

    const { tree } = runTreeJs(paragraph, [paragraph], null);

    expect(tree?.text).toBe('line one line two');
    expect(tree?.children[0].text).toBe('inner text');
  });

  it('recurses into element children and preserves their attrs', () => {
    const items = [
      el('li', { 'data-id': '1' }, [txt('first')]),
      el('li', { 'data-id': '2' }, [txt('second')]),
    ];
    const list = el('ul', { role: 'list' }, items);

    const { tree } = runTreeJs(list, [list], null);

    expect(tree?.children).toHaveLength(2);
    expect(tree?.children[0]).toEqual({
      tag: 'li',
      attrs: { 'data-id': '1' },
      text: 'first',
      children: [],
    });
  });

  it('returns matched=N and serializes only the first match', () => {
    const articleA = el('article', { id: 'a' }, [txt('first')]);
    const articleB = el('article', { id: 'b' }, [txt('second')]);

    const { matched, tree } = runTreeJs(null, [articleA, articleB], 'article');

    expect(matched).toBe(2);
    expect(tree?.attrs.id).toBe('a');
  });

  it('returns tree=null and matched=0 when selector matches nothing', () => {
    const { matched, tree } = runTreeJs(null, [], '.nothing');

    expect(matched).toBe(0);
    expect(tree).toBeNull();
  });

  it('catches SyntaxError from querySelectorAll and returns {invalidSelector:true, reason}', () => {
    // The failure shape is not covered by HtmlTreeResult, so it is spelled out here.
    const outcome = runTreeJsInvalid('##$@@', "'##$@@' is not a valid selector") as {
      selector: string;
      invalidSelector: boolean;
      reason: string;
    };

    expect(outcome.invalidSelector).toBe(true);
    expect(outcome.selector).toBe('##$@@');
    expect(outcome.reason).toContain('not a valid selector');
  });
});
78 changes: 78 additions & 0 deletions src/browser/html-tree.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/**
* Client-side HTML → structured tree serializer.
*
* Returned as a JS string that gets passed to `page.evaluate`. The expression
* walks the DOM subtree rooted at the first selector match (or documentElement
* when no selector is given) and emits a compact `{tag, attrs, text, children}`
* tree for agents to consume instead of re-parsing raw HTML.
*
* Text handling: `text` is the concatenated text of direct text children only,
* whitespace-collapsed. Nested element text is left inside `children[].text`.
* Ordering between text and elements is not preserved — agents that need it
* should fall back to raw HTML mode.
*/

export interface BuildHtmlTreeJsOptions {
  /**
   * CSS selector to scope the tree. When omitted, `null`, or an empty string,
   * the tree is rooted at `document.documentElement`.
   */
  selector?: string | null;
}

/**
 * Returns a JS expression string (an immediately-invoked arrow function).
 * When evaluated in a page context the expression resolves to either
 * `{selector, matched: number, tree: HtmlNode | null}` on success, or
 * `{selector, invalidSelector: true, reason}` when `querySelectorAll`
 * throws (in browsers: a `SyntaxError` for an unparseable selector). Any
 * exception raised by that call is reported this way; `reason` is the
 * error's `message`, or its string form when no message is available.
 *
 * Callers must branch on `invalidSelector` to convert it into the CLI's
 * `invalid_selector` structured error; otherwise the browser-level exception
 * would bubble out of `page.evaluate` and bypass the structured-error
 * contract that agents rely on.
 *
 * Details of the generated expression:
 * - `matched` is the number of selector matches; only the first match is
 *   serialized. With no selector, `matched` is 1 and the root is
 *   `document.documentElement`.
 * - Attributes are stored as own data properties via
 *   `Object.defineProperty`. A plain `attrs[name] = value` assignment would
 *   route an attribute literally named `__proto__` through the inherited
 *   prototype setter, which ignores string values and silently drops it.
 * - `text` concatenates direct text-node children only, then collapses
 *   whitespace runs to a single space and trims.
 *
 * @param opts see {@link BuildHtmlTreeJsOptions}
 * @returns the expression source, ready to pass to `page.evaluate`
 */
export function buildHtmlTreeJs(opts: BuildHtmlTreeJsOptions = {}): string {
  // JSON.stringify produces a valid JS string literal, so quotes or backslashes
  // in the selector cannot break out of the generated source.
  const selectorLiteral = opts.selector ? JSON.stringify(opts.selector) : 'null';
  return `(() => {
  const selector = ${selectorLiteral};
  let matches;
  if (selector) {
    try { matches = document.querySelectorAll(selector); }
    catch (e) {
      return { selector: selector, invalidSelector: true, reason: (e && e.message) || String(e) };
    }
  } else {
    matches = [document.documentElement];
  }
  const matched = matches.length;
  const root = matches[0] || null;
  function serialize(el) {
    if (!el || el.nodeType !== 1) return null;
    const attrs = {};
    for (const a of el.attributes) {
      Object.defineProperty(attrs, a.name, { value: a.value, enumerable: true, writable: true, configurable: true });
    }
    let text = '';
    const children = [];
    for (const n of el.childNodes) {
      if (n.nodeType === 3) {
        text += n.nodeValue;
      } else if (n.nodeType === 1) {
        const child = serialize(n);
        if (child) children.push(child);
      }
    }
    return { tag: el.tagName.toLowerCase(), attrs, text: text.replace(/\\s+/g, ' ').trim(), children };
  }
  return { selector: selector, matched: matched, tree: root ? serialize(root) : null };
})()`;
}

/**
 * One serialized DOM element, as emitted by the expression that
 * `buildHtmlTreeJs` generates.
 */
export interface HtmlNode {
/** Element tag name, lower-cased by the serializer. */
tag: string;
/** Attribute name → attribute value for every attribute on the element. */
attrs: Record<string, string>;
/**
 * Concatenated text of the element's direct text-node children only, with
 * whitespace runs collapsed to a single space and the ends trimmed.
 */
text: string;
/** Serialized element children, in DOM order; text nodes are not included here. */
children: HtmlNode[];
}

/**
 * Success-shaped result of evaluating the `buildHtmlTreeJs` expression.
 *
 * The expression can also resolve to `{selector, invalidSelector: true, reason}`
 * when `querySelectorAll` throws; that shape is not modelled by this interface,
 * so callers must check for `invalidSelector` before treating a value as this type.
 */
export interface HtmlTreeResult {
/** The selector the tree was scoped to, or `null` when unscoped. */
selector: string | null;
/** Number of elements the selector matched (1 for the unscoped documentElement case). */
matched: number;
/** Serialization of the first match, or `null` when there is no element to serialize. */
tree: HtmlNode | null;
}
160 changes: 160 additions & 0 deletions src/cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,166 @@ describe('browser network command', () => {
});
});

// CLI-level tests for `opencli browser get html`. The page is a mock whose
// `evaluate` result is stubbed per test; output is observed through a
// `console.log` spy.
describe('browser get html command', () => {
  const consoleLogSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

  /** First argument of the most recent `console.log` call; throws if nothing was logged. */
  function lastLogArg(): unknown {
    const logged = consoleLogSpy.mock.calls;
    if (logged.length === 0) throw new Error('expected console.log call');
    const [firstArg] = logged[logged.length - 1];
    return firstArg;
  }

  /** Most recent log output parsed as JSON; throws if it was not a string. */
  function lastJsonLog(): any {
    const payload = lastLogArg();
    if (typeof payload === 'string') return JSON.parse(payload);
    throw new Error(`expected string arg, got ${typeof payload}`);
  }

  /** Queues `value` as the resolved result of the next `page.evaluate` call. */
  function stubEvaluateOnce(value: unknown): void {
    (browserState.page!.evaluate as any).mockResolvedValueOnce(value);
  }

  /** Runs `opencli browser get html` with the given extra CLI arguments. */
  async function runGetHtml(...extraArgs: string[]): Promise<void> {
    const program = createProgram('', '');
    await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', ...extraArgs]);
  }

  beforeEach(() => {
    process.exitCode = undefined;
    process.env.OPENCLI_CACHE_DIR = fs.mkdtempSync(path.join(os.tmpdir(), 'opencli-html-'));
    consoleLogSpy.mockClear();
    mockBrowserConnect.mockClear();
    mockBrowserClose.mockReset().mockResolvedValue(undefined);

    browserState.page = {
      setActivePage: vi.fn(),
      getActivePage: vi.fn().mockReturnValue('tab-1'),
      tabs: vi.fn().mockResolvedValue([{ page: 'tab-1', active: true }]),
      evaluate: vi.fn(),
    } as unknown as IPage;
  });

  it('returns full outerHTML by default with no truncation', async () => {
    const big = '<div>' + 'x'.repeat(100_000) + '</div>';
    stubEvaluateOnce({ kind: 'ok', html: big });

    await runGetHtml();

    expect(lastLogArg()).toBe(big);
  });

  it('caps output with --max and prepends a visible truncation marker', async () => {
    const big = '<div>' + 'x'.repeat(500) + '</div>';
    stubEvaluateOnce({ kind: 'ok', html: big });

    await runGetHtml('--max', '100');

    const printed = String(lastLogArg());
    expect(printed.startsWith('<!-- opencli: truncated 100 of')).toBe(true);
    expect(printed.length).toBeGreaterThan(100);
    expect(printed.length).toBeLessThan(big.length);
  });

  it('rejects negative --max with invalid_max error', async () => {
    await runGetHtml('--max', '-1');

    expect(lastJsonLog().error.code).toBe('invalid_max');
    expect(process.exitCode).toBeDefined();
    expect(browserState.page!.evaluate).not.toHaveBeenCalled();
  });

  it('rejects fractional --max with invalid_max error', async () => {
    await runGetHtml('--max', '1.5');

    expect(lastJsonLog().error.code).toBe('invalid_max');
    expect(process.exitCode).toBeDefined();
    expect(browserState.page!.evaluate).not.toHaveBeenCalled();
  });

  it('rejects non-numeric --max (e.g. "10abc") with invalid_max error', async () => {
    await runGetHtml('--max', '10abc');

    expect(lastJsonLog().error.code).toBe('invalid_max');
    expect(process.exitCode).toBeDefined();
    expect(browserState.page!.evaluate).not.toHaveBeenCalled();
  });

  it('--as json returns structured tree envelope', async () => {
    stubEvaluateOnce({
      selector: '.hero',
      matched: 1,
      tree: { tag: 'div', attrs: { class: 'hero' }, text: 'Hi', children: [] },
    });

    await runGetHtml('--selector', '.hero', '--as', 'json');

    const envelope = lastJsonLog();
    expect(envelope.matched).toBe(1);
    expect(envelope.tree.tag).toBe('div');
    expect(envelope.tree.attrs.class).toBe('hero');
  });

  it('--as json emits selector_not_found when matched is 0', async () => {
    stubEvaluateOnce({ selector: '.missing', matched: 0, tree: null });

    await runGetHtml('--selector', '.missing', '--as', 'json');

    expect(lastJsonLog().error.code).toBe('selector_not_found');
    expect(process.exitCode).toBeDefined();
  });

  it('raw mode emits selector_not_found when the selector matches nothing', async () => {
    stubEvaluateOnce({ kind: 'ok', html: null });

    await runGetHtml('--selector', '.missing');

    expect(lastJsonLog().error.code).toBe('selector_not_found');
    expect(process.exitCode).toBeDefined();
  });

  it('raw mode emits invalid_selector when the page rejects the selector syntax', async () => {
    stubEvaluateOnce({
      kind: 'invalid_selector',
      reason: "'##$@@' is not a valid selector",
    });

    await runGetHtml('--selector', '##$@@');

    const failure = lastJsonLog().error;
    expect(failure.code).toBe('invalid_selector');
    expect(failure.message).toContain('##$@@');
    expect(failure.message).toContain('not a valid selector');
    expect(process.exitCode).toBeDefined();
  });

  it('--as json emits invalid_selector when the page rejects the selector syntax', async () => {
    stubEvaluateOnce({
      selector: '##$@@',
      invalidSelector: true,
      reason: "'##$@@' is not a valid selector",
    });

    await runGetHtml('--selector', '##$@@', '--as', 'json');

    const failure = lastJsonLog().error;
    expect(failure.code).toBe('invalid_selector');
    expect(failure.message).toContain('##$@@');
    expect(process.exitCode).toBeDefined();
  });

  it('rejects unknown --as format with invalid_format error', async () => {
    await runGetHtml('--as', 'yaml');

    expect(lastJsonLog().error.code).toBe('invalid_format');
    expect(process.exitCode).toBeDefined();
  });
});

describe('findPackageRoot', () => {
it('walks up from dist/src to the package root', () => {
const packageRoot = path.join('repo-root');
Expand Down
Loading
Loading