feat(web,download): absorb #1048 — video/audio/iframe + --stdout (#1146)

jackwener · web-flow · commit 648390eacd29 · 2026-04-22T18:42:38.000+08:00
* feat(web,download): absorb #1048 media + --stdout into web read

  Distill the useful pieces of the abandoned PR #1048 (`web md`) into the
  existing shared pipeline instead of introducing a parallel command:

  - Turndown rules for <video> / <audio> / <iframe>. Video and audio are
    emitted as inline HTML so renderers that support it keep playback, and
    iframes degrade to markdown links (title + src) so embedded content
    (YouTube, CodePen, …) stays reachable. `iframe` moves out of
    STRIPPED_TAGS since it's now handled explicitly.
  - `stdout` option on ArticleDownloadOptions: writes the full markdown to
    process.stdout, skips image download + mkdir + file write, and reports
    saved='-'. Remote image URLs stay intact so piped output is
    self-contained.
  - `web read --stdout` wires the above through.
  - Lazy-load src rewrite: the extractor now promotes data-src /
    data-original / data-lazy-src / data-srcset onto `src` before the HTML
    is frozen, so the markdown body and the image-download list reference
    the same URL (previously a page with placeholder.gif + data-src
    produced broken image links in the output).

  Nothing in #1048 that overlapped with the already-merged #1143 hardening
  was kept — no new Readability wiring, no duplicate Turndown config, no
  new command.

* fix(web): keep stdout streaming output clean

* fix(tests): update iframe e2e assertion and drop relative src import

  - article-extract e2e fixture test: iframe now converts to a markdown
    link instead of being stripped, so assert the YouTube embed link
    survives rather than asserting its absence.
  - clis/web/read.test.js: replace vi.importActual('../../src/registry.js')
    with a direct __test__.command export from read.js; the relative import
    into src/ tripped the package-exports adapter guardrail.
diff --git a/cli-manifest.json b/cli-manifest.json
@@ -17032,6 +17032,13 @@
         "default": 3,
         "required": false,
         "help": "Seconds to wait after page load"
+      },
+      {
+        "name": "stdout",
+        "type": "boolean",
+        "default": false,
+        "required": false,
+        "help": "Print markdown to stdout instead of saving to a file"
       }
     ],
     "columns": [
diff --git a/clis/web/read.js b/clis/web/read.js
@@ -15,7 +15,7 @@
  */
 import { cli, Strategy } from '@jackwener/opencli/registry';
 import { downloadArticle } from '@jackwener/opencli/download/article-download';
-cli({
+const command = cli({
     site: 'web',
     name: 'read',
     description: 'Fetch any web page and export as Markdown',
@@ -26,6 +26,7 @@ cli({
         { name: 'output', default: './web-articles', help: 'Output directory' },
         { name: 'download-images', type: 'boolean', default: true, help: 'Download images locally' },
         { name: 'wait', type: 'int', default: 3, help: 'Seconds to wait after page load' },
+        { name: 'stdout', type: 'boolean', default: false, help: 'Print markdown to stdout instead of saving to a file' },
     ],
     columns: ['title', 'author', 'publish_time', 'status', 'size', 'saved'],
     func: async (page, kwargs) => {
@@ -162,14 +163,26 @@ cli({
           if (el.children && el.children.length > 2) dedup(el);
         });
 
+        // --- Lazy-load image src rewrite ---
+        // Many sites render <img src="placeholder.gif" data-src="real.jpg">.
+        // Promote the real URL onto src so both the markdown body and the
+        // image download list reference the same URL.
+        clone.querySelectorAll('img').forEach(img => {
+          const srcset = img.getAttribute('data-srcset') || '';
+          const srcsetFirst = srcset.split(',')[0]?.trim().split(' ')[0] || '';
+          const real = img.getAttribute('data-src')
+            || img.getAttribute('data-original')
+            || img.getAttribute('data-lazy-src')
+            || srcsetFirst;
+          if (real) img.setAttribute('src', real);
+        });
+
         result.contentHtml = clone.innerHTML;
 
         // --- Image extraction ---
         const seen = new Set();
         clone.querySelectorAll('img').forEach(img => {
-          const src = img.getAttribute('data-src')
-            || img.getAttribute('data-original')
-            || img.getAttribute('src');
+          const src = img.getAttribute('src') || '';
           if (src && !src.startsWith('data:') && !seen.has(src)) {
             seen.add(src);
             result.imageUrls.push(src);
@@ -186,7 +199,7 @@ cli({
             referer = parsed.origin + '/';
         }
         catch { /* ignore */ }
-        return downloadArticle({
+        const result = await downloadArticle({
             title: data?.title || 'untitled',
             author: data?.author,
             publishTime: data?.publishTime,
@@ -197,6 +210,13 @@ cli({
             output: kwargs.output,
             downloadImages: kwargs['download-images'],
             imageHeaders: referer ? { Referer: referer } : undefined,
+            stdout: kwargs.stdout,
         });
+        // `--stdout` is a content-streaming mode. The markdown body already went
+        // to process.stdout inside downloadArticle(), so returning rows here
+        // would make Commander append table/JSON output to the same stdout
+        // stream and break piping.
+        return kwargs.stdout ? null : result;
     },
 });
+export const __test__ = { command };
diff --git a/clis/web/read.test.js b/clis/web/read.test.js
@@ -0,0 +1,76 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+const { mockDownloadArticle } = vi.hoisted(() => ({
+    mockDownloadArticle: vi.fn(),
+}));
+
+vi.mock('@jackwener/opencli/download/article-download', () => ({
+    downloadArticle: mockDownloadArticle,
+}));
+
+const { __test__ } = await import('./read.js');
+
+describe('web/read stdout behavior', () => {
+    const read = __test__.command;
+    const page = {
+        goto: vi.fn().mockResolvedValue(undefined),
+        wait: vi.fn().mockResolvedValue(undefined),
+        evaluate: vi.fn().mockResolvedValue({
+            title: 'Example Article',
+            author: 'Author',
+            publishTime: '2026-04-22',
+            contentHtml: '<p>hello</p>',
+            imageUrls: ['https://example.com/a.jpg'],
+        }),
+    };
+
+    beforeEach(() => {
+        mockDownloadArticle.mockReset();
+        mockDownloadArticle.mockResolvedValue([{
+            title: 'Example Article',
+            author: 'Author',
+            publish_time: '2026-04-22',
+            status: 'success',
+            size: '1 KB',
+            saved: '-',
+        }]);
+        page.goto.mockClear();
+        page.wait.mockClear();
+        page.evaluate.mockClear();
+    });
+
+    it('returns null in --stdout mode so the CLI does not append result rows to stdout', async () => {
+        const result = await read.func(page, {
+            url: 'https://example.com/article',
+            output: '/tmp/out',
+            'download-images': false,
+            stdout: true,
+        });
+
+        expect(result).toBeNull();
+        expect(mockDownloadArticle).toHaveBeenCalledWith(
+            expect.objectContaining({
+                title: 'Example Article',
+                sourceUrl: 'https://example.com/article',
+            }),
+            expect.objectContaining({
+                output: '/tmp/out',
+                stdout: true,
+            }),
+        );
+    });
+
+    it('still returns the saved-row payload when writing to disk', async () => {
+        const rows = [{ title: 'Example Article', saved: '/tmp/out/Example Article/example.md' }];
+        mockDownloadArticle.mockResolvedValue(rows);
+
+        const result = await read.func(page, {
+            url: 'https://example.com/article',
+            output: '/tmp/out',
+            'download-images': false,
+            stdout: false,
+        });
+
+        expect(result).toBe(rows);
+    });
+});
diff --git a/src/browser/article-extract.e2e.test.ts b/src/browser/article-extract.e2e.test.ts
@@ -81,7 +81,7 @@ describe('article extract → markdown e2e fixtures', () => {
     expect(md).not.toContain('Standard file extension');
   });
 
-  it('extracts a Deno blog fixture and strips embedded iframe chrome from markdown', async () => {
+  it('extracts a Deno blog fixture, preserves embedded iframes as markdown links, and drops page chrome', async () => {
     const url = 'https://deno.com/blog/v2.0';
     const article = runExtract(loadFixture('deno-v2.html'), url);
     expect(article?.source).toBe('readability');
@@ -91,7 +91,7 @@ describe('article extract → markdown e2e fixtures', () => {
     const md = await renderMarkdown(article, url);
     expect(md).toContain('## Announcing Deno 2');
     expect(md).toContain('The web is humanity’s largest software platform');
-    expect(md).not.toContain('youtube.com/embed');
+    expect(md).toMatch(/\]\(https:\/\/www\.youtube(?:-nocookie)?\.com\/embed\/[^)]+\)/);
     expect(md).not.toContain('Skip to main content');
   });
 
diff --git a/src/download/article-download.test.ts b/src/download/article-download.test.ts
@@ -80,20 +80,22 @@ describe('downloadArticle', () => {
       expect(md).toContain('[ ] todo');
     });
 
-    it('strips script / style / noscript / iframe / form', async () => {
+    it('strips script / style / noscript / form but keeps iframe as a link', async () => {
       const md = await runAndRead(
         '<p>keep</p>' +
         '<script>alert(1)</script>' +
         '<style>.x{color:red}</style>' +
         '<noscript>nojs</noscript>' +
-        '<iframe src="x"></iframe>' +
+        '<iframe src="https://www.youtube.com/embed/abc" title="Demo video"></iframe>' +
         '<form><button>click</button></form>',
       );
       expect(md).toContain('keep');
       expect(md).not.toContain('alert');
       expect(md).not.toContain('color:red');
       expect(md).not.toContain('nojs');
       expect(md).not.toContain('click');
+      // Iframe degrades to a link preserving the embedded URL.
+      expect(md).toContain('[Demo video](https://www.youtube.com/embed/abc)');
     });
 
     it('strips SVG nodes entirely', async () => {
@@ -176,5 +178,123 @@ describe('downloadArticle', () => {
       expect(md).toContain('also-keep');
       expect(md).not.toContain('strip-me');
     });
+
+    it('preserves <video> as inline HTML with src + poster', async () => {
+      const md = await runAndRead(
+        '<p>before</p>' +
+        '<video src="https://cdn.example.com/clip.mp4" poster="https://cdn.example.com/poster.jpg"></video>' +
+        '<p>after</p>',
+      );
+      expect(md).toContain('<video src="https://cdn.example.com/clip.mp4" controls poster="https://cdn.example.com/poster.jpg"></video>');
+      expect(md).toContain('before');
+      expect(md).toContain('after');
+    });
+
+    it('falls back to <source> inside <video> when src attribute is absent', async () => {
+      const md = await runAndRead(
+        '<video><source src="https://cdn.example.com/clip.mp4" type="video/mp4"></video>',
+      );
+      expect(md).toContain('<video src="https://cdn.example.com/clip.mp4" controls></video>');
+    });
+
+    it('drops <video> with no src and no <source>', async () => {
+      const md = await runAndRead('<p>before</p><video></video><p>after</p>');
+      expect(md).not.toContain('<video');
+      expect(md).toContain('before');
+      expect(md).toContain('after');
+    });
+
+    it('preserves <audio> as inline HTML', async () => {
+      const md = await runAndRead(
+        '<audio src="https://cdn.example.com/podcast.mp3"></audio>',
+      );
+      expect(md).toContain('<audio src="https://cdn.example.com/podcast.mp3" controls></audio>');
+    });
+
+    it('degrades <iframe> to a markdown link with title', async () => {
+      const md = await runAndRead(
+        '<iframe src="https://codepen.io/pen/abc" title="Live demo"></iframe>',
+      );
+      expect(md).toContain('[Live demo](https://codepen.io/pen/abc)');
+    });
+
+    it('defaults iframe title to "Embedded content" when missing', async () => {
+      const md = await runAndRead(
+        '<iframe src="https://example.com/embed"></iframe>',
+      );
+      expect(md).toContain('[Embedded content](https://example.com/embed)');
+    });
+
+    it('drops <iframe> with no src', async () => {
+      const md = await runAndRead('<p>before</p><iframe></iframe><p>after</p>');
+      expect(md).not.toContain('iframe');
+      expect(md).toContain('before');
+      expect(md).toContain('after');
+    });
+  });
+
+  describe('stdout mode', () => {
+    it('writes markdown to process.stdout and skips file write', async () => {
+      const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
+      tempDirs.push(tempDir);
+
+      const chunks: string[] = [];
+      const originalWrite = process.stdout.write.bind(process.stdout);
+      process.stdout.write = ((chunk: string | Uint8Array): boolean => {
+        chunks.push(typeof chunk === 'string' ? chunk : Buffer.from(chunk).toString('utf8'));
+        return true;
+      }) as typeof process.stdout.write;
+
+      try {
+        const result = await downloadArticle({
+          title: 'Piped',
+          contentHtml: '<p>Streaming body</p>',
+          sourceUrl: 'https://example.com/a',
+        }, {
+          output: tempDir,
+          stdout: true,
+        });
+
+        expect(result[0].status).toBe('success');
+        expect(result[0].saved).toBe('-');
+        expect(fs.readdirSync(tempDir)).toHaveLength(0);
+
+        const emitted = chunks.join('');
+        expect(emitted).toContain('# Piped');
+        expect(emitted).toContain('Streaming body');
+        expect(emitted.endsWith('\n')).toBe(true);
+      } finally {
+        process.stdout.write = originalWrite;
+      }
+    });
+
+    it('keeps remote image URLs intact in stdout mode (no download)', async () => {
+      const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
+      tempDirs.push(tempDir);
+
+      const chunks: string[] = [];
+      const originalWrite = process.stdout.write.bind(process.stdout);
+      process.stdout.write = ((chunk: string | Uint8Array): boolean => {
+        chunks.push(typeof chunk === 'string' ? chunk : Buffer.from(chunk).toString('utf8'));
+        return true;
+      }) as typeof process.stdout.write;
+
+      try {
+        await downloadArticle({
+          title: 'WithImage',
+          contentHtml: '<p><img src="https://example.com/a.jpg"></p>',
+          imageUrls: ['https://example.com/a.jpg'],
+        }, {
+          output: tempDir,
+          downloadImages: true,
+          stdout: true,
+        });
+
+        expect(fs.readdirSync(tempDir)).toHaveLength(0);
+        expect(chunks.join('')).toContain('https://example.com/a.jpg');
+      } finally {
+        process.stdout.write = originalWrite;
+      }
+    });
   });
 });
diff --git a/src/download/article-download.ts b/src/download/article-download.ts