From d76c4d9994bea794f43d332dfb59a83d54859435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E4=BB=A5=E7=90=B3?= Date: Tue, 26 May 2026 22:53:17 +0800 Subject: [PATCH 1/2] feat(douyin): add search command for keyword video search DOM extraction from www.douyin.com/search/?type=video. Requires logged-in profile. plays/comments/shares exposed as 0 (card markup only surfaces likes); see Follow-ups for full-counter path. Schema aligned with tiktok search. Refs https://github.com/Daily-AC/omnireach/issues/12 --- cli-manifest.json | 39 ++++++ clis/douyin/search.js | 264 +++++++++++++++++++++++++++++++++++++ clis/douyin/search.test.js | 232 ++++++++++++++++++++++++++++++++ 3 files changed, 535 insertions(+) create mode 100644 clis/douyin/search.js create mode 100644 clis/douyin/search.test.js diff --git a/cli-manifest.json b/cli-manifest.json index e79ac2eb1..d1fa195bf 100644 --- a/cli-manifest.json +++ b/cli-manifest.json @@ -9418,6 +9418,45 @@ "sourceFile": "douyin/publish.js", "navigateBefore": "https://creator.douyin.com" }, + { + "site": "douyin", + "name": "search", + "description": "关键词搜索抖音视频", + "access": "read", + "domain": "www.douyin.com", + "strategy": "cookie", + "browser": true, + "args": [ + { + "name": "query", + "type": "str", + "required": true, + "positional": true, + "help": "搜索关键词" + }, + { + "name": "limit", + "type": "int", + "default": 10, + "required": false, + "help": "结果数量 (1-30)" + } + ], + "columns": [ + "rank", + "desc", + "author", + "url", + "plays", + "likes", + "comments", + "shares" + ], + "type": "js", + "modulePath": "douyin/search.js", + "sourceFile": "douyin/search.js", + "navigateBefore": "https://www.douyin.com" + }, { "site": "douyin", "name": "stats", diff --git a/clis/douyin/search.js b/clis/douyin/search.js new file mode 100644 index 000000000..9540cbff6 --- /dev/null +++ b/clis/douyin/search.js @@ -0,0 +1,264 @@ +/** + * Douyin search — keyword video search on www.douyin.com. 
+ * + * Strategy: DOM extraction from the server-rendered search results page. + * + * Why not XHR interception: + * The `www.douyin.com/search/?type=video` page renders results into + * `<ul data-e2e="scroll-list">` server-side during initial navigation + * and (for the OpenCLI-bridged browser context) does NOT fire a + * subsequent `/aweme/v1/web/general/search/single/` XHR — we confirmed + * this by `wait xhr "general/search/single"` timing out at 20s on a + * logged-in profile that has visible result cards in the DOM. Direct + * synthesis of the XHR from page context returns + * `status_code: 0, data: [], search_nil_info: { search_nil_type: + * "verify_check" }` because the bare URL lacks the SPA-computed + * `a_bogus` / `msToken` signature. + * + * DOM extraction sidesteps both blockers: the data is already in the + * rendered HTML at the moment of navigation, signature-free. + * + * Selector approach: + * Douyin obfuscates card classnames (e.g. `.ckopQfVu`, `.cIiU4Muu`) + * and they churn between builds. We pin only the stable hooks: + * - container: `[data-e2e="scroll-list"]` + * - row: `li` inside the container + * - url: `a[href*="/video/"]` + * - other fields are extracted from the row's leaf text nodes by + * SHAPE (digit+万/亿 → likes; HH:MM:SS or MM:SS → duration; text after + * `@` → author nickname; longest remaining → desc). + * + * Output fields mirror `tiktok search` (rank, desc, author, url, plays, + * likes, comments, shares) so downstream tools that already normalize + * tiktok rows can consume douyin rows without per-adapter glue. The + * search results page only surfaces the like count — plays/comments/ + * shares are not in the card markup and we expose them as 0 rather + * than fabricate values; clients that need them should fetch + * /aweme/v1/web/aweme/detail/?aweme_id=... for the relevant id. + * + * Prerequisite: the bound Chrome profile must be logged in to + * https://www.douyin.com. The search results page renders an empty + * skeleton for anonymous visitors, which we surface as AuthRequiredError. 
+ */ +import { cli, Strategy } from '@jackwener/opencli/registry'; +import { ArgumentError, AuthRequiredError, CommandExecutionError } from '@jackwener/opencli/errors'; + +export const MAX_SEARCH_LIMIT = 30; +// Time budget for the SPA's initial DOM commit. Empirically the +// scroll-list `<li>` rows appear within 2-4s of navigation when logged +// in; 15s covers slow networks without blocking on a permanently-empty +// page (anonymous gate, network error). +export const RENDER_TIMEOUT_MS = 15000; + +export function parseSearchLimit(raw) { + const parsed = Number(raw ?? 10); + if (!Number.isFinite(parsed) || !Number.isInteger(parsed)) { + throw new ArgumentError(`--limit must be an integer between 1 and ${MAX_SEARCH_LIMIT}, got ${JSON.stringify(raw)}`); + } + if (parsed < 1 || parsed > MAX_SEARCH_LIMIT) { + throw new ArgumentError(`--limit must be between 1 and ${MAX_SEARCH_LIMIT}, got ${parsed}`); + } + return parsed; +} + +/** + * Parse a Douyin display count like "1.9万", "3.1万", "4702", "1.2亿" + * into a plain integer. Returns 0 for unparseable input rather than + * throwing — the CLI promises numeric columns and missing data is + * common enough on real result rows that a soft fallback is the right + * choice. Commas are ignored, so "1,234" and "1,234万" both parse. + */ +export function parseDouyinCount(text) { + if (typeof text !== 'string') return 0; + const m = text.replace(/[,\s]/g, '').match(/^(\d+(?:\.\d+)?)([万亿])?$/); + if (!m) { + // Signed, hex, or exponent strings are not display counts; do not coerce them. + return 0; + } + const n = Number(m[1]); + if (!Number.isFinite(n)) return 0; + if (m[2] === '万') return Math.round(n * 10_000); + if (m[2] === '亿') return Math.round(n * 100_000_000); + return Math.round(n); +} + +/** + * Resolve scheme-relative or absolute Douyin video links to the canonical + * https://www.douyin.com/video/<id> shape. Returns '' for empty or non-string + * input rather than throwing — callers expect a string column. 
+ */ +export function normalizeDouyinVideoUrl(href) { + if (typeof href !== 'string' || !href) return ''; + let full = href; + if (full.startsWith('//')) full = 'https:' + full; + else if (full.startsWith('/')) full = 'https://www.douyin.com' + full; + const idMatch = full.match(/\/video\/(\d+)/); + if (idMatch) return `https://www.douyin.com/video/${idMatch[1]}`; + return full; +} + +/** + * Project a single rendered card into the canonical row shape. Operates + * on a serialized card payload (the raw `{url, leafTexts}` we collect + * via page.evaluate) so this function is unit-testable without a real + * browser. + * + * `leafTexts` is the ordered list of `textContent.trim()` for every leaf + * element inside the card (no children). The fields we want are + * identified by shape: + * - duration: matches `HH:MM:SS` or `MM:SS` + * - likes: first token matching `<digits>(.<digits>)?(万|亿)?` that ISN'T the duration + * - author: the text node immediately following an `@` text node + * - desc: the longest remaining leaf text + */ +export function projectCard(card, index) { + const url = normalizeDouyinVideoUrl(card?.url); + const texts = Array.isArray(card?.leafTexts) ? card.leafTexts.map((t) => String(t ?? '').trim()).filter(Boolean) : []; + + const DURATION_RE = /^\d{1,2}:\d{2}(?::\d{2})?$/; + const COUNT_RE = /^\d+(?:\.\d+)?[万亿]?$/; + + let likes = null; // null = not seen yet; a literal "0" must still count as found + let author = ''; + let longest = ''; + + for (let i = 0; i < texts.length; i++) { + const t = texts[i]; + if (DURATION_RE.test(t)) continue; + if (likes === null && COUNT_RE.test(t)) { + likes = parseDouyinCount(t); + continue; + } + if (t === '@' && !author) { + author = (texts[i + 1] ?? '').trim(); + continue; + } + if (t === author) continue; + if (t.length > longest.length) longest = t; + } + let desc = longest; + // Strip a leading "@author" that some renders fuse into the desc text node. 
+ if (author && desc.startsWith('@' + author)) { + desc = desc.slice(author.length + 1).trim(); + } + return { + rank: index + 1, + desc, + author, + url, + plays: 0, + likes: likes ?? 0, + comments: 0, + shares: 0, + }; +} + +// JS snippet that waits for the scroll-list to populate, then returns +// `{state: 'rendered', cards}` or `{state: 'login_wall'}` / +// `{state: 'timeout'}`. Runs inside page.evaluate so we don't pay a +// round-trip per poll iteration. +const WAIT_AND_EXTRACT_JS = (timeoutMs) => ` + new Promise((resolve) => { + const collectCards = () => { + const cards = []; + const lis = document.querySelectorAll('[data-e2e="scroll-list"] li'); + for (const li of lis) { + const a = li.querySelector('a[href*="/video/"]'); + if (!a) continue; + const leafTexts = []; + for (const el of li.querySelectorAll('*')) { + if (el.children.length > 0) continue; + const t = (el.textContent || '').trim(); + if (t) leafTexts.push(t); + } + cards.push({ url: a.getAttribute('href') || '', leafTexts }); + } + return cards; + }; + const detectState = () => { + const cards = collectCards(); + if (cards.length > 0) return { state: 'rendered', cards }; + // Anonymous gate: Douyin renders a centered "登录后查看更多内容" + // overlay on /search/ for visitors without sessionid. Match either + // the literal Chinese prompt or a visible login modal/mask. 
+ const text = (document.body && document.body.innerText) || ''; + if (/登录后查看|请先登录|登录抖音/.test(text)) return { state: 'login_wall' }; + const modal = document.querySelector('[class*="login-mask"], [class*="LoginMask"], [class*="login-modal"], dialog[role="dialog"]'); + if (modal && modal instanceof HTMLElement) { + const r = modal.getBoundingClientRect(); + const s = getComputedStyle(modal); + if (r.width > 0 && r.height > 0 && s.display !== 'none' && s.visibility !== 'hidden') { + return { state: 'login_wall' }; + } + } + return null; + }; + const found = detectState(); + if (found) return resolve(found); + const observer = new MutationObserver(() => { + const s = detectState(); + if (s) { observer.disconnect(); resolve(s); } + }); + observer.observe(document.body, { childList: true, subtree: true }); + setTimeout(() => { + observer.disconnect(); + const fallback = detectState(); + resolve(fallback ?? { state: 'timeout' }); + }, ${timeoutMs}); + }) +`; + +function unwrapEvaluateResult(payload) { + if (payload && !Array.isArray(payload) && typeof payload === 'object' && 'session' in payload && 'data' in payload) { + return payload.data; + } + return payload; +} + +cli({ + site: 'douyin', + name: 'search', + access: 'read', + description: '关键词搜索抖音视频', + domain: 'www.douyin.com', + strategy: Strategy.COOKIE, + args: [ + { name: 'query', required: true, positional: true, help: '搜索关键词' }, + { name: 'limit', type: 'int', default: 10, help: `结果数量 (1-${MAX_SEARCH_LIMIT})` }, + ], + columns: ['rank', 'desc', 'author', 'url', 'plays', 'likes', 'comments', 'shares'], + func: async (page, kwargs) => { + const limit = parseSearchLimit(kwargs.limit); + const keyword = String(kwargs.query ?? 
'').trim(); + if (!keyword) { + throw new ArgumentError('douyin search 需要 关键词'); + } + await page.goto(`https://www.douyin.com/search/${encodeURIComponent(keyword)}?type=video`); + let result; + try { + result = unwrapEvaluateResult(await page.evaluate(WAIT_AND_EXTRACT_JS(RENDER_TIMEOUT_MS))); + } catch (error) { + throw new CommandExecutionError(`Douyin search extraction failed: ${error instanceof Error ? error.message : String(error)}`); + } + if (!result || typeof result !== 'object') { + throw new CommandExecutionError('Douyin search: unexpected evaluator payload shape'); + } + if (result.state === 'login_wall') { + throw new AuthRequiredError( + 'www.douyin.com', + 'Douyin search results are blocked behind a login wall — log in at https://www.douyin.com in Chrome first.', + ); + } + if (result.state === 'timeout' || !Array.isArray(result.cards) || result.cards.length === 0) { + // No cards rendered within the budget AND no explicit login + // wall detected. Most common cause is still an unauthenticated + // session (the page just hides results silently); surface as + // AuthRequiredError with the same actionable message. + throw new AuthRequiredError( + 'www.douyin.com', + 'Douyin search returned no results. 
Log in to https://www.douyin.com in Chrome — anonymous sessions get an empty results page without a visible login prompt.', + ); + } + return result.cards.slice(0, limit).map((card, index) => projectCard(card, index)); + }, +}); diff --git a/clis/douyin/search.test.js b/clis/douyin/search.test.js new file mode 100644 index 000000000..63659c3b3 --- /dev/null +++ b/clis/douyin/search.test.js @@ -0,0 +1,232 @@ +import { describe, expect, it, vi } from 'vitest'; +import { getRegistry } from '@jackwener/opencli/registry'; +import { + MAX_SEARCH_LIMIT, + normalizeDouyinVideoUrl, + parseDouyinCount, + parseSearchLimit, + projectCard, +} from './search.js'; + +function createPageMock({ evaluateResult } = {}) { + return { + goto: vi.fn().mockResolvedValue(undefined), + wait: vi.fn().mockResolvedValue(undefined), + evaluate: vi.fn().mockResolvedValue(evaluateResult), + }; +} + +describe('douyin search', () => { + it('registers the command on www.douyin.com', () => { + const registry = getRegistry(); + const cmd = [...registry.values()].find((c) => c.site === 'douyin' && c.name === 'search'); + expect(cmd).toBeDefined(); + expect(cmd?.domain).toBe('www.douyin.com'); + }); + + it('rejects invalid limit before navigation', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock(); + await expect(cmd.func(page, { query: '咖啡', limit: 0 })).rejects.toMatchObject({ + code: 'ARGUMENT', + message: expect.stringContaining('--limit'), + }); + expect(page.goto).not.toHaveBeenCalled(); + expect(page.evaluate).not.toHaveBeenCalled(); + }); + + it('rejects limit above MAX_SEARCH_LIMIT', () => { + expect(() => parseSearchLimit(MAX_SEARCH_LIMIT + 1)).toThrow(/--limit/); + }); + + it('rejects an empty query', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock(); + await expect(cmd.func(page, { query: ' ', limit: 5 })).rejects.toMatchObject({ + code: 'ARGUMENT', + }); + expect(page.goto).not.toHaveBeenCalled(); + 
}); + + it('returns ranked cards from the rendered scroll-list', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ + evaluateResult: { + state: 'rendered', + cards: [ + { + url: '//www.douyin.com/video/7585120459717365001', + leafTexts: [ + '合集', + '03:55', + '1.9万', + 'Python邪修,5分钟学完Python基础 #python #编程', + '@', + '校长讲python(无小号)', + '5月前', + ], + }, + ], + }, + }); + const rows = await cmd.func(page, { query: 'python', limit: 5 }); + expect(page.goto).toHaveBeenCalledWith('https://www.douyin.com/search/python?type=video'); + expect(rows).toEqual([ + { + rank: 1, + desc: 'Python邪修,5分钟学完Python基础 #python #编程', + author: '校长讲python(无小号)', + url: 'https://www.douyin.com/video/7585120459717365001', + plays: 0, + likes: 19000, + comments: 0, + shares: 0, + }, + ]); + }); + + it('encodes Chinese keywords in the URL path', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: { state: 'rendered', cards: [{ url: '/video/1', leafTexts: ['hi'] }] } }); + await cmd.func(page, { query: 'AI 编程', limit: 1 }); + expect(page.goto).toHaveBeenCalledWith('https://www.douyin.com/search/AI%20%E7%BC%96%E7%A8%8B?type=video'); + }); + + it('respects --limit cap when the page rendered more cards than requested', async () => { + const cmd = getRegistry().get('douyin/search'); + const cards = Array.from({ length: 12 }, (_, i) => ({ + url: `//www.douyin.com/video/100000${i}`, + leafTexts: ['03:00', `${i + 1}万`, `video ${i}`, '@', `user${i}`], + })); + const page = createPageMock({ evaluateResult: { state: 'rendered', cards } }); + const rows = await cmd.func(page, { query: 'x', limit: 3 }); + expect(rows).toHaveLength(3); + expect(rows.map((r) => r.url)).toEqual([ + 'https://www.douyin.com/video/1000000', + 'https://www.douyin.com/video/1000001', + 'https://www.douyin.com/video/1000002', + ]); + }); + + it('maps the explicit login-wall state to AuthRequiredError', async () => { + const cmd 
= getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: { state: 'login_wall' } }); + await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ + code: 'AUTH_REQUIRED', + message: expect.stringContaining('login wall'), + }); + }); + + it('maps the timeout/empty state to AuthRequiredError (anonymous sessions get a silent empty page)', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: { state: 'timeout' } }); + await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ + code: 'AUTH_REQUIRED', + }); + }); + + it('unwraps Browser Bridge {session, data} envelopes before inspecting state', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ + evaluateResult: { + session: 'site:douyin', + data: { state: 'rendered', cards: [{ url: '/video/9', leafTexts: ['demo'] }] }, + }, + }); + const rows = await cmd.func(page, { query: 'x', limit: 1 }); + expect(rows).toHaveLength(1); + expect(rows[0].url).toBe('https://www.douyin.com/video/9'); + }); + + it('throws CommandExecutionError on malformed evaluator payload', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: 'not-an-object' }); + await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ + code: 'COMMAND_EXEC', + }); + }); +}); + +describe('parseDouyinCount', () => { + it.each([ + ['1.9万', 19_000], + ['3万', 30_000], + ['4702', 4702], + ['1,234', 1234], + ['1.2亿', 120_000_000], + ['', 0], + ['unknown', 0], + [null, 0], + [undefined, 0], + ])('parses %j as %i', (input, expected) => { + expect(parseDouyinCount(input)).toBe(expected); + }); +}); + +describe('normalizeDouyinVideoUrl', () => { + it.each([ + ['//www.douyin.com/video/123', 'https://www.douyin.com/video/123'], + ['/video/123?foo=bar', 'https://www.douyin.com/video/123'], + 
['https://www.douyin.com/video/123?something', 'https://www.douyin.com/video/123'], + ['', ''], + [null, ''], + ])('normalizes %j → %j', (input, expected) => { + expect(normalizeDouyinVideoUrl(input)).toBe(expected); + }); +}); + +describe('projectCard', () => { + it('extracts duration/likes/desc/author by leaf-text shape, classname-agnostic', () => { + const row = projectCard({ + url: '//www.douyin.com/video/7585120459717365001', + leafTexts: ['合集', '03:55', '1.9万', 'Python邪修', '@', '校长', '5月前'], + }, 0); + expect(row).toEqual({ + rank: 1, + desc: 'Python邪修', + author: '校长', + url: 'https://www.douyin.com/video/7585120459717365001', + plays: 0, + likes: 19000, + comments: 0, + shares: 0, + }); + }); + + it('returns the longest non-skipped text as desc, not the publish-date suffix', () => { + const row = projectCard({ + url: '/video/1', + leafTexts: ['02:00', '4702', 'hi long-text', '@', 'user', '1月前'], + }, 0); + expect(row.desc).toBe('hi long-text'); + expect(row.author).toBe('user'); + }); + + it('strips a fused @author prefix from the desc when present', () => { + const row = projectCard({ + url: '/video/1', + leafTexts: ['02:00', '100', '@alice this is the caption', '@', 'alice'], + }, 0); + expect(row.author).toBe('alice'); + expect(row.desc).toBe('this is the caption'); + }); + + it('returns safe defaults when leafTexts is missing', () => { + const row = projectCard({ url: '/video/42', leafTexts: undefined }, 4); + expect(row).toEqual({ + rank: 5, + desc: '', + author: '', + url: 'https://www.douyin.com/video/42', + plays: 0, + likes: 0, + comments: 0, + shares: 0, + }); + }); + + it('returns rank=index+1 regardless of input', () => { + const row = projectCard({ url: '/video/1', leafTexts: ['x'] }, 9); + expect(row.rank).toBe(10); + }); +}); From fe28823039d734655924369dcb430953fa64914f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E4=BB=A5=E7=90=B3?= Date: Wed, 27 May 2026 19:53:51 +0800 Subject: [PATCH 2/2] feat(weixin): add --stdout flag to download 
command for piping markdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the --stdout pattern already used by clis/web/read.js, which threads through the existing `stdout?: boolean` option in ArticleDownloadOptions (src/download/article-download.ts). When --stdout is set, the markdown body is written to process.stdout via the existing downloadArticle() helper, image downloads are skipped (downloadArticle handles this), and the CLI func returns null to suppress Commander's row output that would otherwise corrupt the piped stream. The errorHint early-return path is unchanged — verification gate detection runs before downloadArticle() is called, so --stdout has zero effect on that branch. --- cli-manifest.json | 7 ++ clis/weixin/download.js | 9 +- clis/weixin/download.test.js | 182 +++++++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 clis/weixin/download.test.js diff --git a/cli-manifest.json b/cli-manifest.json index d1fa195bf..6756267fd 100644 --- a/cli-manifest.json +++ b/cli-manifest.json @@ -27825,6 +27825,13 @@ "default": true, "required": false, "help": "Download images locally" + }, + { + "name": "stdout", + "type": "boolean", + "default": false, + "required": false, + "help": "Print markdown to stdout instead of saving to a file" } ], "columns": [ diff --git a/clis/weixin/download.js b/clis/weixin/download.js index 0b8ff59a0..cdf1f101b 100644 --- a/clis/weixin/download.js +++ b/clis/weixin/download.js @@ -179,6 +179,7 @@ cli({ { name: 'url', required: true, help: 'WeChat article URL (mp.weixin.qq.com/s/xxx)' }, { name: 'output', default: './weixin-articles', help: 'Output directory' }, { name: 'download-images', type: 'boolean', default: true, help: 'Download images locally' }, + { name: 'stdout', type: 'boolean', default: false, help: 'Print markdown to stdout instead of saving to a file' }, ], columns: ['title', 'author', 'publish_time', 'status', 
'size', 'saved'], func: async (page, kwargs) => { @@ -301,7 +302,7 @@ cli({ saved: '-', }]; } - return downloadArticle({ + const result = await downloadArticle({ title: data?.title || '', author: data?.author, publishTime: data?.publishTime, @@ -318,6 +319,12 @@ cli({ const m = url.match(/wx_fmt=(\w+)/) || url.match(/\.(\w{3,4})(?:\?|$)/); return m ? m[1] : 'png'; }, + stdout: kwargs.stdout, }); + // `--stdout` is a content-streaming mode. The markdown body already went + // to process.stdout inside downloadArticle(), so returning rows here + // would make Commander append table/JSON output to the same stdout + // stream and break piping. Mirror clis/web/read.js's pattern. + return kwargs.stdout ? null : result; }, }); diff --git a/clis/weixin/download.test.js b/clis/weixin/download.test.js new file mode 100644 index 000000000..01c393ff7 --- /dev/null +++ b/clis/weixin/download.test.js @@ -0,0 +1,182 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { getRegistry } from '@jackwener/opencli/registry'; + +const { mockDownloadArticle } = vi.hoisted(() => ({ + mockDownloadArticle: vi.fn(), +})); + +vi.mock('@jackwener/opencli/download/article-download', () => ({ + downloadArticle: mockDownloadArticle, +})); + +// Import side-effect — registers `weixin/download` in the registry. +await import('./download.js'); + +describe('weixin download command', () => { + const command = getRegistry().get('weixin/download'); + + const extractedArticle = { + title: '测试文章', + author: '某公众号', + publishTime: '2026-05-27', + errorHint: '', + contentHtml: '

    正文内容

    ', + codeBlocks: [], + imageUrls: ['https://example.com/img.jpg'], + }; + + const verificationGatePayload = { + title: '', + author: '', + publishTime: '', + errorHint: 'environment verification required', + contentHtml: '', + codeBlocks: [], + imageUrls: [], + }; + + let page; + + beforeEach(() => { + vi.restoreAllMocks(); + mockDownloadArticle.mockReset(); + mockDownloadArticle.mockResolvedValue([{ + title: '测试文章', + author: '某公众号', + publish_time: '2026-05-27', + status: 'success', + size: '1 KB', + saved: '/tmp/out/测试文章/测试文章.md', + }]); + page = { + goto: vi.fn().mockResolvedValue(undefined), + wait: vi.fn().mockResolvedValue(undefined), + evaluate: vi.fn().mockResolvedValue(extractedArticle), + }; + }); + + it('registers as a cookie-strategy browser command for mp.weixin.qq.com', () => { + expect(command).toBeDefined(); + expect(command.site).toBe('weixin'); + expect(command.name).toBe('download'); + expect(command.strategy).toBe('cookie'); + expect(command.domain).toBe('mp.weixin.qq.com'); + }); + + it('exposes a boolean --stdout flag defaulting to false', () => { + const stdoutArg = command.args.find((a) => a.name === 'stdout'); + expect(stdoutArg).toBeDefined(); + expect(stdoutArg.type).toBe('boolean'); + expect(stdoutArg.default).toBe(false); + }); + + it('rejects non-mp.weixin.qq.com URLs before any browser navigation', async () => { + const result = await command.func(page, { + url: 'https://example.com/article', + output: '/tmp/out', + 'download-images': false, + stdout: false, + }); + expect(result).toEqual([ + expect.objectContaining({ status: 'invalid URL' }), + ]); + expect(page.goto).not.toHaveBeenCalled(); + }); + + describe('--stdout=true behavior', () => { + it('passes stdout:true through to downloadArticle and returns null to suppress row output', async () => { + const result = await command.func(page, { + url: 'https://mp.weixin.qq.com/s/abc123', + output: '/tmp/out', + 'download-images': true, + stdout: true, + }); + + 
expect(result).toBeNull(); + expect(mockDownloadArticle).toHaveBeenCalledTimes(1); + const [data, options] = mockDownloadArticle.mock.calls[0]; + expect(data).toEqual(expect.objectContaining({ + title: '测试文章', + author: '某公众号', + publishTime: '2026-05-27', + sourceUrl: 'https://mp.weixin.qq.com/s/abc123', + contentHtml: '

    正文内容

    ', + })); + expect(options).toEqual(expect.objectContaining({ + output: '/tmp/out', + downloadImages: true, + stdout: true, + imageHeaders: { Referer: 'https://mp.weixin.qq.com/' }, + frontmatterLabels: { author: '公众号' }, + })); + }); + + it('takes the errorHint early-return path BEFORE downloadArticle, even when --stdout=true', async () => { + // Lock the §5 semantic: errorHint detection runs in-page (line 242 of download.js) + // and short-circuits the cli func at line 294, never reaching downloadArticle(). + // --stdout has zero effect on this branch — it must return the structured + // verification-required row regardless, so omnireach can read row.status + // and surface captcha_suspected. + page.evaluate.mockResolvedValue(verificationGatePayload); + + const result = await command.func(page, { + url: 'https://mp.weixin.qq.com/s/blocked', + output: '/tmp/out', + 'download-images': true, + stdout: true, + }); + + expect(mockDownloadArticle).not.toHaveBeenCalled(); + expect(result).toEqual([ + expect.objectContaining({ + title: 'Error', + status: expect.stringContaining('verification required'), + }), + ]); + }); + }); + + describe('--stdout=false (default) behavior', () => { + it('passes stdout:false through to downloadArticle and returns the row payload unchanged', async () => { + const savedRows = [{ + title: '测试文章', + author: '某公众号', + publish_time: '2026-05-27', + status: 'success', + size: '2 KB', + saved: '/tmp/out/测试文章/测试文章.md', + }]; + mockDownloadArticle.mockResolvedValue(savedRows); + + const result = await command.func(page, { + url: 'https://mp.weixin.qq.com/s/abc123', + output: '/tmp/out', + 'download-images': true, + stdout: false, + }); + + expect(result).toBe(savedRows); + const [, options] = mockDownloadArticle.mock.calls[0]; + expect(options.stdout).toBe(false); + }); + + it('returns the verification-required row when errorHint is set (no --stdout)', async () => { + page.evaluate.mockResolvedValue(verificationGatePayload); + + const result 
= await command.func(page, { + url: 'https://mp.weixin.qq.com/s/blocked', + output: '/tmp/out', + 'download-images': true, + stdout: false, + }); + + expect(mockDownloadArticle).not.toHaveBeenCalled(); + expect(result).toEqual([ + expect.objectContaining({ + title: 'Error', + status: expect.stringContaining('verification required'), + }), + ]); + }); + }); +});