Skip to content

Commit 648390e

Browse files
authored
feat(web,download): absorb #1048 — video/audio/iframe + --stdout (#1146)
* feat(web,download): absorb #1048 media + --stdout into web read

  Distill the useful pieces of the abandoned PR #1048 (`web md`) into the existing shared pipeline instead of introducing a parallel command:

  - Turndown rules for <video> / <audio> / <iframe>. Video and audio are emitted as inline HTML so renderers that support it keep playback, and iframes degrade to markdown links (title + src) so embedded content (YouTube, CodePen, …) stays reachable. `iframe` moves out of STRIPPED_TAGS since it's now handled explicitly.
  - `stdout` option on ArticleDownloadOptions: writes the full markdown to process.stdout, skips image download + mkdir + file write, and reports saved='-'. Remote image URLs stay intact so piped output is self-contained.
  - `web read --stdout` wires the above through.
  - Lazy-load src rewrite: the extractor now promotes data-src / data-original / data-lazy-src / data-srcset onto `src` before the HTML is frozen, so the markdown body and the image-download list reference the same URL (previously a page with placeholder.gif + data-src produced broken image links in the output).

  Nothing in #1048 that overlapped with the already-merged #1143 hardening was kept — no new Readability wiring, no duplicate Turndown config, no new command.

* fix(web): keep stdout streaming output clean

* fix(tests): update iframe e2e assertion and drop relative src import

  - article-extract e2e fixture test: iframe now converts to a markdown link instead of being stripped, so assert the YouTube embed link survives rather than asserting its absence.
  - clis/web/read.test.js: replace vi.importActual('../../src/registry.js') with a direct __test__.command export from read.js; the relative import into src/ tripped the package-exports adapter guardrail.
1 parent 733ac07 commit 648390e

6 files changed

Lines changed: 301 additions & 18 deletions

File tree

cli-manifest.json

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17032,6 +17032,13 @@
1703217032
"default": 3,
1703317033
"required": false,
1703417034
"help": "Seconds to wait after page load"
17035+
},
17036+
{
17037+
"name": "stdout",
17038+
"type": "boolean",
17039+
"default": false,
17040+
"required": false,
17041+
"help": "Print markdown to stdout instead of saving to a file"
1703517042
}
1703617043
],
1703717044
"columns": [

clis/web/read.js

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
*/
1616
import { cli, Strategy } from '@jackwener/opencli/registry';
1717
import { downloadArticle } from '@jackwener/opencli/download/article-download';
18-
cli({
18+
const command = cli({
1919
site: 'web',
2020
name: 'read',
2121
description: 'Fetch any web page and export as Markdown',
@@ -26,6 +26,7 @@ cli({
2626
{ name: 'output', default: './web-articles', help: 'Output directory' },
2727
{ name: 'download-images', type: 'boolean', default: true, help: 'Download images locally' },
2828
{ name: 'wait', type: 'int', default: 3, help: 'Seconds to wait after page load' },
29+
{ name: 'stdout', type: 'boolean', default: false, help: 'Print markdown to stdout instead of saving to a file' },
2930
],
3031
columns: ['title', 'author', 'publish_time', 'status', 'size', 'saved'],
3132
func: async (page, kwargs) => {
@@ -162,14 +163,26 @@ cli({
162163
if (el.children && el.children.length > 2) dedup(el);
163164
});
164165
166+
// --- Lazy-load image src rewrite ---
167+
// Many sites render <img src="placeholder.gif" data-src="real.jpg">.
168+
// Promote the real URL onto src so both the markdown body and the
169+
// image download list reference the same URL.
170+
clone.querySelectorAll('img').forEach(img => {
171+
const srcset = img.getAttribute('data-srcset') || '';
172+
const srcsetFirst = srcset.split(',')[0]?.trim().split(' ')[0] || '';
173+
const real = img.getAttribute('data-src')
174+
|| img.getAttribute('data-original')
175+
|| img.getAttribute('data-lazy-src')
176+
|| srcsetFirst;
177+
if (real) img.setAttribute('src', real);
178+
});
179+
165180
result.contentHtml = clone.innerHTML;
166181
167182
// --- Image extraction ---
168183
const seen = new Set();
169184
clone.querySelectorAll('img').forEach(img => {
170-
const src = img.getAttribute('data-src')
171-
|| img.getAttribute('data-original')
172-
|| img.getAttribute('src');
185+
const src = img.getAttribute('src') || '';
173186
if (src && !src.startsWith('data:') && !seen.has(src)) {
174187
seen.add(src);
175188
result.imageUrls.push(src);
@@ -186,7 +199,7 @@ cli({
186199
referer = parsed.origin + '/';
187200
}
188201
catch { /* ignore */ }
189-
return downloadArticle({
202+
const result = await downloadArticle({
190203
title: data?.title || 'untitled',
191204
author: data?.author,
192205
publishTime: data?.publishTime,
@@ -197,6 +210,13 @@ cli({
197210
output: kwargs.output,
198211
downloadImages: kwargs['download-images'],
199212
imageHeaders: referer ? { Referer: referer } : undefined,
213+
stdout: kwargs.stdout,
200214
});
215+
// `--stdout` is a content-streaming mode. The markdown body already went
216+
// to process.stdout inside downloadArticle(), so returning rows here
217+
// would make Commander append table/JSON output to the same stdout
218+
// stream and break piping.
219+
return kwargs.stdout ? null : result;
201220
},
202221
});
222+
export const __test__ = { command };

clis/web/read.test.js

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import { beforeEach, describe, expect, it, vi } from 'vitest';
2+
3+
const { mockDownloadArticle } = vi.hoisted(() => ({
4+
mockDownloadArticle: vi.fn(),
5+
}));
6+
7+
vi.mock('@jackwener/opencli/download/article-download', () => ({
8+
downloadArticle: mockDownloadArticle,
9+
}));
10+
11+
const { __test__ } = await import('./read.js');
12+
13+
describe('web/read stdout behavior', () => {
14+
const read = __test__.command;
15+
const page = {
16+
goto: vi.fn().mockResolvedValue(undefined),
17+
wait: vi.fn().mockResolvedValue(undefined),
18+
evaluate: vi.fn().mockResolvedValue({
19+
title: 'Example Article',
20+
author: 'Author',
21+
publishTime: '2026-04-22',
22+
contentHtml: '<p>hello</p>',
23+
imageUrls: ['https://example.com/a.jpg'],
24+
}),
25+
};
26+
27+
beforeEach(() => {
28+
mockDownloadArticle.mockReset();
29+
mockDownloadArticle.mockResolvedValue([{
30+
title: 'Example Article',
31+
author: 'Author',
32+
publish_time: '2026-04-22',
33+
status: 'success',
34+
size: '1 KB',
35+
saved: '-',
36+
}]);
37+
page.goto.mockClear();
38+
page.wait.mockClear();
39+
page.evaluate.mockClear();
40+
});
41+
42+
it('returns null in --stdout mode so the CLI does not append result rows to stdout', async () => {
43+
const result = await read.func(page, {
44+
url: 'https://example.com/article',
45+
output: '/tmp/out',
46+
'download-images': false,
47+
stdout: true,
48+
});
49+
50+
expect(result).toBeNull();
51+
expect(mockDownloadArticle).toHaveBeenCalledWith(
52+
expect.objectContaining({
53+
title: 'Example Article',
54+
sourceUrl: 'https://example.com/article',
55+
}),
56+
expect.objectContaining({
57+
output: '/tmp/out',
58+
stdout: true,
59+
}),
60+
);
61+
});
62+
63+
it('still returns the saved-row payload when writing to disk', async () => {
64+
const rows = [{ title: 'Example Article', saved: '/tmp/out/Example Article/example.md' }];
65+
mockDownloadArticle.mockResolvedValue(rows);
66+
67+
const result = await read.func(page, {
68+
url: 'https://example.com/article',
69+
output: '/tmp/out',
70+
'download-images': false,
71+
stdout: false,
72+
});
73+
74+
expect(result).toBe(rows);
75+
});
76+
});

src/browser/article-extract.e2e.test.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@ describe('article extract → markdown e2e fixtures', () => {
8181
expect(md).not.toContain('Standard file extension');
8282
});
8383

84-
it('extracts a Deno blog fixture and strips embedded iframe chrome from markdown', async () => {
84+
it('extracts a Deno blog fixture, preserves embedded iframes as markdown links, and drops page chrome', async () => {
8585
const url = 'https://deno.com/blog/v2.0';
8686
const article = runExtract(loadFixture('deno-v2.html'), url);
8787
expect(article?.source).toBe('readability');
@@ -91,7 +91,7 @@ describe('article extract → markdown e2e fixtures', () => {
9191
const md = await renderMarkdown(article, url);
9292
expect(md).toContain('## Announcing Deno 2');
9393
expect(md).toContain('The web is humanity’s largest software platform');
94-
expect(md).not.toContain('youtube.com/embed');
94+
expect(md).toMatch(/\]\(https:\/\/www\.youtube(?:-nocookie)?\.com\/embed\/[^)]+\)/);
9595
expect(md).not.toContain('Skip to main content');
9696
});
9797

src/download/article-download.test.ts

Lines changed: 122 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -80,20 +80,22 @@ describe('downloadArticle', () => {
8080
expect(md).toContain('[ ] todo');
8181
});
8282

83-
it('strips script / style / noscript / iframe / form', async () => {
83+
it('strips script / style / noscript / form but keeps iframe as a link', async () => {
8484
const md = await runAndRead(
8585
'<p>keep</p>' +
8686
'<script>alert(1)</script>' +
8787
'<style>.x{color:red}</style>' +
8888
'<noscript>nojs</noscript>' +
89-
'<iframe src="x"></iframe>' +
89+
'<iframe src="https://www.youtube.com/embed/abc" title="Demo video"></iframe>' +
9090
'<form><button>click</button></form>',
9191
);
9292
expect(md).toContain('keep');
9393
expect(md).not.toContain('alert');
9494
expect(md).not.toContain('color:red');
9595
expect(md).not.toContain('nojs');
9696
expect(md).not.toContain('click');
97+
// Iframe degrades to a link preserving the embedded URL.
98+
expect(md).toContain('[Demo video](https://www.youtube.com/embed/abc)');
9799
});
98100

99101
it('strips SVG nodes entirely', async () => {
@@ -176,5 +178,123 @@ describe('downloadArticle', () => {
176178
expect(md).toContain('also-keep');
177179
expect(md).not.toContain('strip-me');
178180
});
181+
182+
it('preserves <video> as inline HTML with src + poster', async () => {
183+
const md = await runAndRead(
184+
'<p>before</p>' +
185+
'<video src="https://cdn.example.com/clip.mp4" poster="https://cdn.example.com/poster.jpg"></video>' +
186+
'<p>after</p>',
187+
);
188+
expect(md).toContain('<video src="https://cdn.example.com/clip.mp4" controls poster="https://cdn.example.com/poster.jpg"></video>');
189+
expect(md).toContain('before');
190+
expect(md).toContain('after');
191+
});
192+
193+
it('falls back to <source> inside <video> when src attribute is absent', async () => {
194+
const md = await runAndRead(
195+
'<video><source src="https://cdn.example.com/clip.mp4" type="video/mp4"></video>',
196+
);
197+
expect(md).toContain('<video src="https://cdn.example.com/clip.mp4" controls></video>');
198+
});
199+
200+
it('drops <video> with no src and no <source>', async () => {
201+
const md = await runAndRead('<p>before</p><video></video><p>after</p>');
202+
expect(md).not.toContain('<video');
203+
expect(md).toContain('before');
204+
expect(md).toContain('after');
205+
});
206+
207+
it('preserves <audio> as inline HTML', async () => {
208+
const md = await runAndRead(
209+
'<audio src="https://cdn.example.com/podcast.mp3"></audio>',
210+
);
211+
expect(md).toContain('<audio src="https://cdn.example.com/podcast.mp3" controls></audio>');
212+
});
213+
214+
it('degrades <iframe> to a markdown link with title', async () => {
215+
const md = await runAndRead(
216+
'<iframe src="https://codepen.io/pen/abc" title="Live demo"></iframe>',
217+
);
218+
expect(md).toContain('[Live demo](https://codepen.io/pen/abc)');
219+
});
220+
221+
it('defaults iframe title to "Embedded content" when missing', async () => {
222+
const md = await runAndRead(
223+
'<iframe src="https://example.com/embed"></iframe>',
224+
);
225+
expect(md).toContain('[Embedded content](https://example.com/embed)');
226+
});
227+
228+
it('drops <iframe> with no src', async () => {
229+
const md = await runAndRead('<p>before</p><iframe></iframe><p>after</p>');
230+
expect(md).not.toContain('iframe');
231+
expect(md).toContain('before');
232+
expect(md).toContain('after');
233+
});
234+
});
235+
236+
describe('stdout mode', () => {
237+
it('writes markdown to process.stdout and skips file write', async () => {
238+
const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
239+
tempDirs.push(tempDir);
240+
241+
const chunks: string[] = [];
242+
const originalWrite = process.stdout.write.bind(process.stdout);
243+
process.stdout.write = ((chunk: string | Uint8Array): boolean => {
244+
chunks.push(typeof chunk === 'string' ? chunk : Buffer.from(chunk).toString('utf8'));
245+
return true;
246+
}) as typeof process.stdout.write;
247+
248+
try {
249+
const result = await downloadArticle({
250+
title: 'Piped',
251+
contentHtml: '<p>Streaming body</p>',
252+
sourceUrl: 'https://example.com/a',
253+
}, {
254+
output: tempDir,
255+
stdout: true,
256+
});
257+
258+
expect(result[0].status).toBe('success');
259+
expect(result[0].saved).toBe('-');
260+
expect(fs.readdirSync(tempDir)).toHaveLength(0);
261+
262+
const emitted = chunks.join('');
263+
expect(emitted).toContain('# Piped');
264+
expect(emitted).toContain('Streaming body');
265+
expect(emitted.endsWith('\n')).toBe(true);
266+
} finally {
267+
process.stdout.write = originalWrite;
268+
}
269+
});
270+
271+
it('keeps remote image URLs intact in stdout mode (no download)', async () => {
272+
const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
273+
tempDirs.push(tempDir);
274+
275+
const chunks: string[] = [];
276+
const originalWrite = process.stdout.write.bind(process.stdout);
277+
process.stdout.write = ((chunk: string | Uint8Array): boolean => {
278+
chunks.push(typeof chunk === 'string' ? chunk : Buffer.from(chunk).toString('utf8'));
279+
return true;
280+
}) as typeof process.stdout.write;
281+
282+
try {
283+
await downloadArticle({
284+
title: 'WithImage',
285+
contentHtml: '<p><img src="https://example.com/a.jpg"></p>',
286+
imageUrls: ['https://example.com/a.jpg'],
287+
}, {
288+
output: tempDir,
289+
downloadImages: true,
290+
stdout: true,
291+
});
292+
293+
expect(fs.readdirSync(tempDir)).toHaveLength(0);
294+
expect(chunks.join('')).toContain('https://example.com/a.jpg');
295+
} finally {
296+
process.stdout.write = originalWrite;
297+
}
298+
});
179299
});
180300
});

0 commit comments

Comments
 (0)