Skip to content

Commit 6cf5cb2

Browse files
feat(browser): remove silent html truncation, add --as json (#1102)
* feat(browser): remove silent html truncation, add --as json tree output `browser get html` had two agent-hostile defaults: 1. A silent 50000-char cap on the returned HTML — agents that got a truncated page had no signal they were looking at half the DOM. 2. Only raw HTML string output, forcing agents to re-parse for structured extraction. Changes: - Default output is now the full outerHTML, no truncation - `--max <n>` opts in to a character cap; when the cap actually trips, the HTML is prepended with `<!-- opencli: truncated N of M chars; re-run without --max ... -->` so agents always see the signal - `--as json` returns `{selector, matched, tree}` where `tree` is `{tag, attrs, text, children}` recursively. `matched` is the full count of selector matches so agents know when more elements exist beyond the first. `text` is the node's own direct text children, whitespace-collapsed; child elements live in `children`. - `--selector` not matching any element now emits structured `{error:{code:"selector_not_found", ...}}` with a non-zero exit code, in both raw and json modes (was `(empty)` stdout previously, indistinguishable from empty element) - Invalid `--as` / negative `--max` emit structured `invalid_format` / `invalid_max` error codes Extracted the tree serializer as `src/browser/html-tree.ts` so the JS expression can be unit-tested against a DOM stub. * fix(browser get html): structured errors for invalid selector & strict --max Both edges previously bypassed the structured-error contract introduced in #1102, which agents rely on for branching: - Invalid CSS selector: querySelector(All) would throw SyntaxError through page.evaluate into the generic exception path. Wrap the lookup in try/catch inside page context for both raw and --as json paths; surface as {error:{code:"invalid_selector", message}} + non-zero exit. - --max validation: parseInt silently accepted "1.5" -> 1 and "10abc" -> 10. Switch to a strict /^\\d+$/ check so fractional, negative, and non-numeric values all return {error:{code:"invalid_max"}}; validation runs up front so bad values never reach the page. Covered by new unit tests in cli.test.ts (fractional, non-numeric, invalid selector on raw + json) and html-tree.test.ts (SyntaxError -> invalidSelector envelope). Co-authored-by: freemandealer <freeman.zhang1992@gmail.com> --------- Co-authored-by: freemandealer <freeman.zhang1992@gmail.com>
1 parent 7fd8bd6 commit 6cf5cb2

File tree

4 files changed

+441
-3
lines changed

4 files changed

+441
-3
lines changed

src/browser/html-tree.test.ts

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import { describe, expect, it } from 'vitest';
2+
import { buildHtmlTreeJs, type HtmlTreeResult } from './html-tree.js';
3+
4+
/**
5+
* The serializer runs in a page context via `page.evaluate`. In unit tests we
6+
* substitute `document` with a minimal stub that mirrors the DOM surface used
7+
* by the expression, then Function-eval the returned JS.
8+
*/
9+
function runTreeJs(root: unknown, selectorMatches: unknown[], selector: string | null): HtmlTreeResult {
10+
const js = buildHtmlTreeJs({ selector });
11+
const fakeDocument = {
12+
querySelectorAll: () => selectorMatches,
13+
documentElement: root,
14+
};
15+
const fn = new Function('document', `return ${js};`);
16+
return fn(fakeDocument) as HtmlTreeResult;
17+
}
18+
19+
function runTreeJsInvalid(selector: string, errorMessage: string): unknown {
20+
const js = buildHtmlTreeJs({ selector });
21+
const fakeDocument = {
22+
querySelectorAll: () => { const e = new Error(errorMessage); e.name = 'SyntaxError'; throw e; },
23+
documentElement: null,
24+
};
25+
const fn = new Function('document', `return ${js};`);
26+
return fn(fakeDocument);
27+
}
28+
29+
function el(tag: string, attrs: Record<string, string>, children: Array<ChildOf>): FakeEl {
30+
return {
31+
nodeType: 1,
32+
tagName: tag.toUpperCase(),
33+
attributes: Object.entries(attrs).map(([name, value]) => ({ name, value })),
34+
childNodes: children,
35+
};
36+
}
37+
38+
function txt(value: string): FakeText { return { nodeType: 3, nodeValue: value }; }
39+
40+
type FakeEl = { nodeType: 1; tagName: string; attributes: Array<{ name: string; value: string }>; childNodes: Array<ChildOf> };
41+
type FakeText = { nodeType: 3; nodeValue: string };
42+
type ChildOf = FakeEl | FakeText;
43+
44+
describe('buildHtmlTreeJs', () => {
45+
it('serializes a simple element into {tag, attrs, text, children}', () => {
46+
const root = el('div', { class: 'hero', id: 'x' }, [txt('Hello')]);
47+
const result = runTreeJs(root, [root], null);
48+
expect(result.selector).toBeNull();
49+
expect(result.matched).toBe(1);
50+
expect(result.tree).toEqual({
51+
tag: 'div',
52+
attrs: { class: 'hero', id: 'x' },
53+
text: 'Hello',
54+
children: [],
55+
});
56+
});
57+
58+
it('collapses whitespace in direct text content only', () => {
59+
const root = el('p', {}, [
60+
txt(' line \n one '),
61+
el('span', {}, [txt('inner text')]),
62+
txt('\tline two\t'),
63+
]);
64+
const result = runTreeJs(root, [root], null);
65+
expect(result.tree?.text).toBe('line one line two');
66+
expect(result.tree?.children[0].text).toBe('inner text');
67+
});
68+
69+
it('recurses into element children and preserves their attrs', () => {
70+
const root = el('ul', { role: 'list' }, [
71+
el('li', { 'data-id': '1' }, [txt('first')]),
72+
el('li', { 'data-id': '2' }, [txt('second')]),
73+
]);
74+
const result = runTreeJs(root, [root], null);
75+
expect(result.tree?.children).toHaveLength(2);
76+
expect(result.tree?.children[0]).toEqual({
77+
tag: 'li',
78+
attrs: { 'data-id': '1' },
79+
text: 'first',
80+
children: [],
81+
});
82+
});
83+
84+
it('returns matched=N and serializes only the first match', () => {
85+
const first = el('article', { id: 'a' }, [txt('first')]);
86+
const second = el('article', { id: 'b' }, [txt('second')]);
87+
const result = runTreeJs(null, [first, second], 'article');
88+
expect(result.matched).toBe(2);
89+
expect(result.tree?.attrs.id).toBe('a');
90+
});
91+
92+
it('returns tree=null and matched=0 when selector matches nothing', () => {
93+
const result = runTreeJs(null, [], '.nothing');
94+
expect(result.matched).toBe(0);
95+
expect(result.tree).toBeNull();
96+
});
97+
98+
it('catches SyntaxError from querySelectorAll and returns {invalidSelector:true, reason}', () => {
99+
const result = runTreeJsInvalid('##$@@', "'##$@@' is not a valid selector") as {
100+
selector: string;
101+
invalidSelector: boolean;
102+
reason: string;
103+
};
104+
expect(result.invalidSelector).toBe(true);
105+
expect(result.selector).toBe('##$@@');
106+
expect(result.reason).toContain('not a valid selector');
107+
});
108+
});

src/browser/html-tree.ts

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/**
2+
* Client-side HTML → structured tree serializer.
3+
*
4+
* Returned as a JS string that gets passed to `page.evaluate`. The expression
5+
* walks the DOM subtree rooted at the first selector match (or documentElement
6+
* when no selector is given) and emits a compact `{tag, attrs, text, children}`
7+
* tree for agents to consume instead of re-parsing raw HTML.
8+
*
9+
* Text handling: `text` is the concatenated text of direct text children only,
10+
* whitespace-collapsed. Nested element text is left inside `children[].text`.
11+
* Ordering between text and elements is not preserved — agents that need it
12+
* should fall back to raw HTML mode.
13+
*/
14+
15+
export interface BuildHtmlTreeJsOptions {
16+
/** CSS selector to scope the tree; unscoped = documentElement */
17+
selector?: string | null;
18+
}
19+
20+
/**
21+
* Returns a JS expression string. When evaluated in a page context the
22+
* expression resolves to either
23+
* `{selector, matched: number, tree: HtmlNode | null}` on success, or
24+
* `{selector, invalidSelector: true, reason}` when `querySelectorAll`
25+
* throws a `SyntaxError` for an unparseable selector.
26+
*
27+
* Callers must branch on `invalidSelector` to convert it into the CLI's
28+
* `invalid_selector` structured error; otherwise the browser-level exception
29+
* would bubble out of `page.evaluate` and bypass the structured-error
30+
* contract that agents rely on.
31+
*/
32+
export function buildHtmlTreeJs(opts: BuildHtmlTreeJsOptions = {}): string {
33+
const selectorLiteral = opts.selector ? JSON.stringify(opts.selector) : 'null';
34+
return `(() => {
35+
const selector = ${selectorLiteral};
36+
let matches;
37+
if (selector) {
38+
try { matches = document.querySelectorAll(selector); }
39+
catch (e) {
40+
return { selector: selector, invalidSelector: true, reason: (e && e.message) || String(e) };
41+
}
42+
} else {
43+
matches = [document.documentElement];
44+
}
45+
const matched = matches.length;
46+
const root = matches[0] || null;
47+
function serialize(el) {
48+
if (!el || el.nodeType !== 1) return null;
49+
const attrs = {};
50+
for (const a of el.attributes) attrs[a.name] = a.value;
51+
let text = '';
52+
const children = [];
53+
for (const n of el.childNodes) {
54+
if (n.nodeType === 3) {
55+
text += n.nodeValue;
56+
} else if (n.nodeType === 1) {
57+
const child = serialize(n);
58+
if (child) children.push(child);
59+
}
60+
}
61+
return { tag: el.tagName.toLowerCase(), attrs, text: text.replace(/\\s+/g, ' ').trim(), children };
62+
}
63+
return { selector: selector, matched: matched, tree: root ? serialize(root) : null };
64+
})()`;
65+
}
66+
67+
export interface HtmlNode {
68+
tag: string;
69+
attrs: Record<string, string>;
70+
text: string;
71+
children: HtmlNode[];
72+
}
73+
74+
export interface HtmlTreeResult {
75+
selector: string | null;
76+
matched: number;
77+
tree: HtmlNode | null;
78+
}

src/cli.test.ts

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,166 @@ describe('browser network command', () => {
442442
});
443443
});
444444

445+
describe('browser get html command', () => {
446+
const consoleLogSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
447+
448+
function lastLogArg(): unknown {
449+
const calls = consoleLogSpy.mock.calls;
450+
if (calls.length === 0) throw new Error('expected console.log call');
451+
return calls[calls.length - 1][0];
452+
}
453+
function lastJsonLog(): any {
454+
const arg = lastLogArg();
455+
if (typeof arg !== 'string') throw new Error(`expected string arg, got ${typeof arg}`);
456+
return JSON.parse(arg);
457+
}
458+
459+
beforeEach(() => {
460+
process.exitCode = undefined;
461+
process.env.OPENCLI_CACHE_DIR = fs.mkdtempSync(path.join(os.tmpdir(), 'opencli-html-'));
462+
consoleLogSpy.mockClear();
463+
mockBrowserConnect.mockClear();
464+
mockBrowserClose.mockReset().mockResolvedValue(undefined);
465+
466+
browserState.page = {
467+
setActivePage: vi.fn(),
468+
getActivePage: vi.fn().mockReturnValue('tab-1'),
469+
tabs: vi.fn().mockResolvedValue([{ page: 'tab-1', active: true }]),
470+
evaluate: vi.fn(),
471+
} as unknown as IPage;
472+
});
473+
474+
it('returns full outerHTML by default with no truncation', async () => {
475+
const big = '<div>' + 'x'.repeat(100_000) + '</div>';
476+
(browserState.page!.evaluate as any).mockResolvedValueOnce({ kind: 'ok', html: big });
477+
const program = createProgram('', '');
478+
479+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html']);
480+
481+
expect(lastLogArg()).toBe(big);
482+
});
483+
484+
it('caps output with --max and prepends a visible truncation marker', async () => {
485+
const big = '<div>' + 'x'.repeat(500) + '</div>';
486+
(browserState.page!.evaluate as any).mockResolvedValueOnce({ kind: 'ok', html: big });
487+
const program = createProgram('', '');
488+
489+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--max', '100']);
490+
491+
const out = String(lastLogArg());
492+
expect(out.startsWith('<!-- opencli: truncated 100 of')).toBe(true);
493+
expect(out.length).toBeGreaterThan(100);
494+
expect(out.length).toBeLessThan(big.length);
495+
});
496+
497+
it('rejects negative --max with invalid_max error', async () => {
498+
const program = createProgram('', '');
499+
500+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--max', '-1']);
501+
502+
expect(lastJsonLog().error.code).toBe('invalid_max');
503+
expect(process.exitCode).toBeDefined();
504+
expect(browserState.page!.evaluate).not.toHaveBeenCalled();
505+
});
506+
507+
it('rejects fractional --max with invalid_max error', async () => {
508+
const program = createProgram('', '');
509+
510+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--max', '1.5']);
511+
512+
expect(lastJsonLog().error.code).toBe('invalid_max');
513+
expect(process.exitCode).toBeDefined();
514+
expect(browserState.page!.evaluate).not.toHaveBeenCalled();
515+
});
516+
517+
it('rejects non-numeric --max (e.g. "10abc") with invalid_max error', async () => {
518+
const program = createProgram('', '');
519+
520+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--max', '10abc']);
521+
522+
expect(lastJsonLog().error.code).toBe('invalid_max');
523+
expect(process.exitCode).toBeDefined();
524+
expect(browserState.page!.evaluate).not.toHaveBeenCalled();
525+
});
526+
527+
it('--as json returns structured tree envelope', async () => {
528+
(browserState.page!.evaluate as any).mockResolvedValueOnce({
529+
selector: '.hero',
530+
matched: 1,
531+
tree: { tag: 'div', attrs: { class: 'hero' }, text: 'Hi', children: [] },
532+
});
533+
const program = createProgram('', '');
534+
535+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--selector', '.hero', '--as', 'json']);
536+
537+
const out = lastJsonLog();
538+
expect(out.matched).toBe(1);
539+
expect(out.tree.tag).toBe('div');
540+
expect(out.tree.attrs.class).toBe('hero');
541+
});
542+
543+
it('--as json emits selector_not_found when matched is 0', async () => {
544+
(browserState.page!.evaluate as any).mockResolvedValueOnce({ selector: '.missing', matched: 0, tree: null });
545+
const program = createProgram('', '');
546+
547+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--selector', '.missing', '--as', 'json']);
548+
549+
expect(lastJsonLog().error.code).toBe('selector_not_found');
550+
expect(process.exitCode).toBeDefined();
551+
});
552+
553+
it('raw mode emits selector_not_found when the selector matches nothing', async () => {
554+
(browserState.page!.evaluate as any).mockResolvedValueOnce({ kind: 'ok', html: null });
555+
const program = createProgram('', '');
556+
557+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--selector', '.missing']);
558+
559+
expect(lastJsonLog().error.code).toBe('selector_not_found');
560+
expect(process.exitCode).toBeDefined();
561+
});
562+
563+
it('raw mode emits invalid_selector when the page rejects the selector syntax', async () => {
564+
(browserState.page!.evaluate as any).mockResolvedValueOnce({
565+
kind: 'invalid_selector',
566+
reason: "'##$@@' is not a valid selector",
567+
});
568+
const program = createProgram('', '');
569+
570+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--selector', '##$@@']);
571+
572+
const err = lastJsonLog().error;
573+
expect(err.code).toBe('invalid_selector');
574+
expect(err.message).toContain('##$@@');
575+
expect(err.message).toContain('not a valid selector');
576+
expect(process.exitCode).toBeDefined();
577+
});
578+
579+
it('--as json emits invalid_selector when the page rejects the selector syntax', async () => {
580+
(browserState.page!.evaluate as any).mockResolvedValueOnce({
581+
selector: '##$@@',
582+
invalidSelector: true,
583+
reason: "'##$@@' is not a valid selector",
584+
});
585+
const program = createProgram('', '');
586+
587+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--selector', '##$@@', '--as', 'json']);
588+
589+
const err = lastJsonLog().error;
590+
expect(err.code).toBe('invalid_selector');
591+
expect(err.message).toContain('##$@@');
592+
expect(process.exitCode).toBeDefined();
593+
});
594+
595+
it('rejects unknown --as format with invalid_format error', async () => {
596+
const program = createProgram('', '');
597+
598+
await program.parseAsync(['node', 'opencli', 'browser', 'get', 'html', '--as', 'yaml']);
599+
600+
expect(lastJsonLog().error.code).toBe('invalid_format');
601+
expect(process.exitCode).toBeDefined();
602+
});
603+
});
604+
445605
describe('findPackageRoot', () => {
446606
it('walks up from dist/src to the package root', () => {
447607
const packageRoot = path.join('repo-root');

0 commit comments

Comments
 (0)