Skip to content

Commit e37dd51

Browse files
authored
Merge pull request #9 from anil-bd/fix/output-csv-html-md-serializers-and-xlsx-rejection
fix(output): wire CSV/HTML/MD serializers, reject XLSX with helpful error
2 parents 739f7e0 + ec4ab4e commit e37dd51

2 files changed

Lines changed: 277 additions & 1 deletion

File tree

src/__tests__/utils/output.test.ts

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
import {describe, it, expect, vi, beforeEach, afterEach} from 'vitest';
2+
import fs from 'fs';
3+
import path from 'path';
4+
import os from 'os';
5+
import {serialize, format_from_ext, print} from '../../utils/output';
6+
7+
// CSV serialization: header row, quoting/escaping, key union across rows,
// single-object input, and JSON encoding of nested values.
describe('utils/output.serialize csv', ()=>{
    it('serializes array of flat objects as RFC 4180 CSV with header row', ()=>{
        const rows = [
            {url: 'https://a.test/1', title: 'A', price: 1.5},
            {url: 'https://a.test/2', title: 'B', price: 2.0},
        ];
        const out = serialize(rows, 'csv');
        const lines = out.trim().split('\n');
        expect(lines[0]).toBe('url,title,price');
        expect(lines[1]).toBe('https://a.test/1,A,1.5');
        expect(lines[2]).toBe('https://a.test/2,B,2');
    });

    it('quotes and escapes embedded commas, quotes, and newlines', ()=>{
        const rows = [{name: 'Smith, John', note: 'He said "hi"'},
            {name: 'multi\nline', note: 'ok'}];
        const out = serialize(rows, 'csv');
        const lines = out.trim().split(/\n/);
        expect(lines[0]).toBe('name,note');
        expect(lines[1]).toBe('"Smith, John","He said ""hi"""');
        // The second record spans two physical lines, so it is checked on
        // the raw output: the newline must stay inside a quoted field.
        expect(out).toContain('"multi\nline",ok\n');
    });

    it('unions keys across heterogeneous rows', ()=>{
        const rows = [{a: 1, b: 2}, {a: 3, c: 4}];
        const out = serialize(rows, 'csv');
        const lines = out.trim().split('\n');
        expect(lines[0]).toBe('a,b,c');
        // Keys missing from a row produce empty cells.
        expect(lines[1]).toBe('1,2,');
        expect(lines[2]).toBe('3,,4');
    });

    it('wraps a single object as one CSV row', ()=>{
        const out = serialize({a: 1, b: 'x'}, 'csv');
        expect(out.trim()).toBe('a,b\n1,x');
    });

    it('serializes nested values via JSON', ()=>{
        const rows = [{id: 1, meta: {tag: 'x'}}];
        const out = serialize(rows, 'csv');
        const lines = out.trim().split('\n');
        // The JSON text contains quotes, so the cell is quoted and doubled.
        expect(lines[1]).toBe('1,"{""tag"":""x""}"');
    });
});
50+
51+
// Markdown serialization: table rendering, cell escaping, and the fenced
// JSON fallback used for data that is not an object or array of objects.
describe('utils/output.serialize markdown', ()=>{
    it('renders an array of objects as a Markdown table', ()=>{
        const table = serialize([{a: 1, b: 'x'}, {a: 2, b: 'y'}], 'markdown');
        const expected_lines = [
            '| a | b |',
            '| --- | --- |',
            '| 1 | x |',
            '| 2 | y |',
        ];
        for (const line of expected_lines)
            expect(table).toContain(line);
    });

    it('escapes pipes and newlines inside cells', ()=>{
        const table = serialize([{a: 'a|b', b: 'line1\nline2'}], 'markdown');
        expect(table).toContain('| a\\|b | line1 line2 |');
    });

    it('falls back to a fenced JSON block for non-tabular data', ()=>{
        const fenced = serialize([1, 2, 3], 'markdown');
        expect(fenced.startsWith('```json')).toBe(true);
    });
});
72+
73+
// HTML serialization: table rendering with escaped cells, raw passthrough of
// string input, and escaping inside the <pre> fallback for non-tabular data.
describe('utils/output.serialize html', ()=>{
    it('renders an array of objects as an HTML table', ()=>{
        const rows = [{a: 1, b: '<x>'}];
        const out = serialize(rows, 'html');
        expect(out).toContain('<thead><tr><th>a</th><th>b</th></tr></thead>');
        expect(out).toContain('<td>1</td><td>&lt;x&gt;</td>');
    });

    // String input is returned as-is by the HTML serializer; it is not the
    // escaped fallback path, so it gets its own accurately named test.
    it('passes string data through unchanged', ()=>{
        const out = serialize('<script>', 'html');
        expect(out).toBe('<script>');
    });

    it('escapes HTML in non-tabular fallback', ()=>{
        // An array of primitives is not tabular, so it is rendered as
        // pretty-printed JSON inside <pre> with markup characters escaped.
        const out = serialize(['<script>'], 'html');
        expect(out.startsWith('<pre>')).toBe(true);
        expect(out).toContain('&lt;script&gt;');
        expect(out).not.toContain('<script>');
    });
});
86+
87+
// Extension-to-format mapping, including the explicit rejection of .xlsx.
describe('utils/output.format_from_ext', ()=>{
    it('maps known extensions', ()=>{
        expect(format_from_ext('a.json')).toBe('json');
        // Matching is case-insensitive.
        expect(format_from_ext('a.CSV')).toBe('csv');
        expect(format_from_ext('a.md')).toBe('markdown');
        expect(format_from_ext('a.html')).toBe('html');
    });

    it('returns null for unknown extensions', ()=>{
        expect(format_from_ext('a.txt')).toBeNull();
        expect(format_from_ext('noext')).toBeNull();
    });

    it('rejects .xlsx with a helpful message and exits 1', ()=>{
        // process.exit is replaced with a throwing stub so the test process
        // survives and the call can be observed.
        const exit = vi.spyOn(process, 'exit').mockImplementation(
            ((_code?: number)=>{ throw new Error('exit'); }) as never);
        const err = vi.spyOn(console, 'error').mockImplementation(()=>{});
        try {
            expect(()=>format_from_ext('out.xlsx')).toThrow('exit');
            expect(exit).toHaveBeenCalledWith(1);
            const msg = err.mock.calls.map(c=>c.join(' ')).join(' ');
            expect(msg).toMatch(/XLSX output is not supported/);
            expect(msg).toMatch(/--pretty -o file\.json/);
            expect(msg).toMatch(/brightdata\.com\/cp\/scrapers/);
        } finally {
            // Restore even when an assertion fails, so the throwing
            // process.exit stub cannot leak into later tests.
            exit.mockRestore();
            err.mockRestore();
        }
    });
});
113+
114+
// End-to-end checks that print() picks the serializer from the -o file
// extension. Each test writes to a unique temp file and reads it back.
describe('utils/output.print writes correct format from extension', ()=>{
    // Paths created by the current test; emptied again in afterEach.
    const tmp_files: string[] = [];
    const make_tmp = (ext: string)=>{
        const p = path.join(os.tmpdir(),
            `bdata-output-test-${Date.now()}-${Math.random()}${ext}`);
        tmp_files.push(p);
        return p;
    };
    beforeEach(()=>{ vi.spyOn(console, 'error').mockImplementation(()=>{}); });
    afterEach(()=>{
        vi.restoreAllMocks();
        // splice(0) drains the list, so files removed here are not retried
        // after every later test. Unlink errors are ignored on purpose:
        // cleanup is best-effort and the file may never have been written.
        for (const f of tmp_files.splice(0)) { try { fs.unlinkSync(f); } catch {} }
    });

    it('-o file.csv writes CSV (regression: was silently writing JSON)', ()=>{
        const out = make_tmp('.csv');
        print([{url: 'https://x.test', title: 'T'}], {output: out});
        const content = fs.readFileSync(out, 'utf8');
        expect(content.split('\n')[0]).toBe('url,title');
        expect(content.split('\n')[1]).toBe('https://x.test,T');
    });

    it('-o file.html writes HTML (regression: was silently writing JSON)', ()=>{
        const out = make_tmp('.html');
        print([{a: 1}], {output: out});
        const content = fs.readFileSync(out, 'utf8');
        expect(content).toContain('<table>');
    });

    it('-o file.md writes Markdown (regression: was silently writing JSON)', ()=>{
        const out = make_tmp('.md');
        print([{a: 1}], {output: out});
        const content = fs.readFileSync(out, 'utf8');
        expect(content).toContain('| a |');
    });

    it('-o file.json writes JSON unchanged', ()=>{
        const out = make_tmp('.json');
        print([{a: 1}], {output: out});
        const content = fs.readFileSync(out, 'utf8');
        expect(JSON.parse(content)).toEqual([{a: 1}]);
    });
});

src/utils/output.ts

Lines changed: 121 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,24 @@ const dim = (s: string)=>ansi('2', s);
1414
// Status helpers. Each one decorates the message and writes it through
// console.error.

// Green line prefixed with a check mark.
const success = (msg: string): void=>{ console.error(green(`✓ ${msg}`)); };

// Yellow line prefixed with a warning sign.
const warn = (msg: string): void=>{ console.error(yellow(`⚠ ${msg}`)); };

// Dimmed line with no prefix.
const info = (msg: string): void=>{ console.error(dim(msg)); };
17-
const fail = (msg: string)=>{ console.error(red(`✗ ${msg}`));
17+
const fail = (msg: string)=>{ console.error(red(`✗ ${msg}`));
1818
process.exit(1); };
1919

2020
type Output_format = 'markdown'|'json'|'pretty'|'html'|'csv'|'raw';
2121

22+
// Output extensions that are rejected outright, keyed by lowercase extension
// (including the leading dot). The value is the message shown to the user.
const UNSUPPORTED_EXTS: Record<string, string> = {
    '.xlsx': [
        'XLSX output is not supported.',
        'Use --pretty -o file.json and convert with a tool like xlsx-cli,',
        'or download as XLSX from the Bright Data web UI',
        '(https://brightdata.com/cp/scrapers).',
    ].join(' '),
    '.xls': [
        'XLS output is not supported.',
        'Use --pretty -o file.json and convert with a tool like xlsx-cli,',
        'or download from the Bright Data web UI',
        '(https://brightdata.com/cp/scrapers).',
    ].join(' '),
};
30+
2231
const format_from_ext = (file_path: string): Output_format|null=>{
2332
const ext = path.extname(file_path).toLowerCase();
33+
if (UNSUPPORTED_EXTS[ext])
34+
fail(UNSUPPORTED_EXTS[ext]);
2435
if (ext == '.json') return 'json';
2536
if (ext == '.md') return 'markdown';
2637
if (ext == '.html') return 'html';
@@ -35,11 +46,120 @@ type Print_opts = {
3546
format?: Output_format;
3647
};
3748

49+
// Normalizes tabular input into a list of row objects.
// Returns the array itself when it is non-empty and every element is a
// non-null, non-array object; wraps a single such object in a one-element
// array; returns null for anything else (including an empty array).
const to_rows = (data: unknown): Record<string, unknown>[]|null=>{
    const is_record = (v: unknown): boolean=>
        typeof v == 'object' && v !== null && !Array.isArray(v);
    if (!Array.isArray(data))
        return is_record(data) ? [data as Record<string, unknown>] : null;
    if (data.length == 0 || !data.every(v=>is_record(v)))
        return null;
    return data as Record<string, unknown>[];
};
59+
60+
// Returns the union of own enumerable keys across all rows, each key listed
// once, in order of first appearance. A Set iterates in insertion order, so
// it provides both the de-duplication and the ordering.
const collect_keys = (rows: Record<string, unknown>[]): string[]=>{
    const keys = new Set<string>();
    for (const row of rows)
    {
        for (const key of Object.keys(row))
            keys.add(key);
    }
    return Array.from(keys);
};
76+
77+
// Converts one cell value to text. Always returns a string and never throws.
//  - null / undefined       -> ''
//  - string                 -> unchanged
//  - number/boolean/bigint  -> String(val)
//  - anything else          -> JSON.stringify(val), falling back to
//                              String(val) when JSON cannot represent it
const cell_to_string = (val: unknown): string=>{
    if (val === null || val === undefined)
        return '';
    if (typeof val == 'string')
        return val;
    if (typeof val == 'number' || typeof val == 'boolean'
        || typeof val == 'bigint')
    {
        return String(val);
    }
    try {
        // JSON.stringify yields undefined (not a string) for functions and
        // symbols; callers invoke .replace on the result, so guard it.
        const json: unknown = JSON.stringify(val);
        if (typeof json == 'string')
            return json;
    } catch {
        // Circular structures and nested bigints make JSON.stringify throw;
        // fall through to the generic conversion instead of aborting output.
    }
    return String(val);
};
86+
87+
// Renders one CSV field. A value containing a double quote, comma, CR or LF
// is wrapped in double quotes with inner quotes doubled; anything else is
// emitted bare.
const csv_escape = (val: unknown): string=>{
    const text = cell_to_string(val);
    const needs_quotes = /[",\r\n]/.test(text);
    return needs_quotes ? '"'+text.replace(/"/g, '""')+'"' : text;
};
93+
94+
// Serializes data as CSV text: one header line built from the union of row
// keys, then one line per row, each terminated by '\n'.
// Strings are returned unchanged. Input that is neither an object nor a
// non-empty array of objects triggers a warning and is returned as
// pretty-printed JSON instead.
const serialize_csv = (data: unknown): string=>{
    if (typeof data == 'string')
        return data;
    const rows = to_rows(data);
    if (rows)
    {
        const keys = collect_keys(rows);
        const to_line = (cells: unknown[]): string=>
            cells.map(c=>csv_escape(c)).join(',');
        const header = to_line(keys);
        const body = rows.map(r=>to_line(keys.map(k=>r[k]))).join('\n');
        return header+'\n'+body+'\n';
    }
    warn('CSV requires an object or array of objects; falling back '
        +'to JSON. Use --json to silence this warning.');
    return JSON.stringify(data, null, 2);
};
109+
110+
// Renders one Markdown table cell: pipes are backslash-escaped and every
// line ending is collapsed to a single space so the cell stays on one row.
// A lone CR is handled too, since CommonMark also treats it as a line ending.
const md_escape = (val: unknown): string=>
    cell_to_string(val).replace(/\|/g, '\\|').replace(/\r\n|\r|\n/g, ' ');
112+
113+
// Serializes data as a Markdown table: header row, '---' divider row, then
// one row per object, with columns taken from the union of row keys.
// Strings are returned unchanged. Input that is neither an object nor a
// non-empty array of objects is emitted as a fenced ```json block.
const serialize_markdown = (data: unknown): string=>{
    if (typeof data == 'string')
        return data;
    const rows = to_rows(data);
    if (!rows)
        return '```json\n'+JSON.stringify(data, null, 2)+'\n```\n';
    const keys = collect_keys(rows);
    // Header cells need the same escaping as body cells: a key containing
    // '|' or a line break would otherwise add columns or split the row.
    const header = '| '+keys.map(k=>md_escape(k)).join(' | ')+' |';
    const divider = '| '+keys.map(()=>'---').join(' | ')+' |';
    const body = rows.map(r=>
        '| '+keys.map(k=>md_escape(r[k])).join(' | ')+' |').join('\n');
    return [header, divider, body].join('\n')+'\n';
};
126+
127+
// Renders a value as HTML-safe text by replacing &, <, > and " with their
// entities. '&' is handled first so the ampersands introduced by the later
// replacements are not escaped a second time.
const html_escape = (val: unknown): string=>{
    const replacements: [RegExp, string][] = [
        [/&/g, '&amp;'],
        [/</g, '&lt;'],
        [/>/g, '&gt;'],
        [/"/g, '&quot;'],
    ];
    let text = cell_to_string(val);
    for (const [pattern, entity] of replacements)
        text = text.replace(pattern, entity);
    return text;
};
133+
134+
// Serializes data as a single-line HTML <table> with a <thead> built from
// the union of row keys and one <tr> per row; all cell text is escaped.
// Strings are returned unchanged. Input that is neither an object nor a
// non-empty array of objects is emitted as escaped, pretty-printed JSON
// inside <pre>.
const serialize_html = (data: unknown): string=>{
    if (typeof data == 'string')
        return data;
    const rows = to_rows(data);
    if (!rows)
        return '<pre>'+html_escape(JSON.stringify(data, null, 2))+'</pre>\n';
    const keys = collect_keys(rows);
    const head_cells = keys.map(k=>'<th>'+html_escape(k)+'</th>');
    const body_rows = rows.map(r=>{
        const cells = keys.map(k=>'<td>'+html_escape(r[k])+'</td>');
        return '<tr>'+cells.join('')+'</tr>';
    });
    return '<table>'
        +'<thead><tr>'+head_cells.join('')+'</tr></thead>'
        +'<tbody>'+body_rows.join('')+'</tbody>'
        +'</table>\n';
};
151+
38152
const serialize = (data: unknown, fmt: Output_format): string=>{
39153
if (fmt == 'pretty')
40154
return JSON.stringify(data, null, 2);
41155
if (fmt == 'json')
42156
return JSON.stringify(data);
157+
if (fmt == 'csv')
158+
return serialize_csv(data);
159+
if (fmt == 'markdown')
160+
return serialize_markdown(data);
161+
if (fmt == 'html')
162+
return serialize_html(data);
43163
if (typeof data == 'string')
44164
return data;
45165
return JSON.stringify(data, null, 2);

0 commit comments

Comments
 (0)