Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions cli-manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -9418,6 +9418,45 @@
"sourceFile": "douyin/publish.js",
"navigateBefore": "https://creator.douyin.com"
},
{
"site": "douyin",
"name": "search",
"description": "关键词搜索抖音视频",
"access": "read",
"domain": "www.douyin.com",
"strategy": "cookie",
"browser": true,
"args": [
{
"name": "query",
"type": "str",
"required": true,
"positional": true,
"help": "搜索关键词"
},
{
"name": "limit",
"type": "int",
"default": 10,
"required": false,
"help": "结果数量 (1-30)"
}
],
"columns": [
"rank",
"desc",
"author",
"url",
"plays",
"likes",
"comments",
"shares"
],
"type": "js",
"modulePath": "douyin/search.js",
"sourceFile": "douyin/search.js",
"navigateBefore": "https://www.douyin.com"
},
{
"site": "douyin",
"name": "stats",
Expand Down Expand Up @@ -27786,6 +27825,13 @@
"default": true,
"required": false,
"help": "Download images locally"
},
{
"name": "stdout",
"type": "boolean",
"default": false,
"required": false,
"help": "Print markdown to stdout instead of saving to a file"
}
],
"columns": [
Expand Down
264 changes: 264 additions & 0 deletions clis/douyin/search.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
/**
* Douyin search — keyword video search on www.douyin.com.
*
* Strategy: DOM extraction from the server-rendered search results page.
*
* Why not XHR interception:
* The `www.douyin.com/search/<q>?type=video` page renders results into
* `<ul data-e2e="scroll-list">` server-side during initial navigation
* and (for the OpenCLI-bridged browser context) does NOT fire a
* subsequent `/aweme/v1/web/general/search/single/` XHR — we confirmed
* this by `wait xhr "general/search/single"` timing out at 20s on a
* logged-in profile that has visible result cards in the DOM. Direct
* synthesis of the XHR from page context returns
* `status_code: 0, data: [], search_nil_info: { search_nil_type:
* "verify_check" }` because the bare URL lacks the SPA-computed
* `a_bogus` / `msToken` signature.
*
* DOM extraction sidesteps both blockers: the data is already in the
* rendered HTML at the moment of navigation, signature-free.
*
* Selector approach:
* Douyin obfuscates card classnames (e.g. `.ckopQfVu`, `.cIiU4Muu`)
* and they churn between builds. We pin only the stable hooks:
* - container: `[data-e2e="scroll-list"]`
* - row: `li` inside the container
* - url: `a[href*="/video/"]`
* - other fields are extracted from the row's leaf text nodes by
* SHAPE (digit+万/亿 → likes; HH:MM or MM:SS → duration; text after
* `@` → author nickname; longest remaining → desc).
*
* Output fields mirror `tiktok search` (rank, desc, author, url, plays,
* likes, comments, shares) so downstream tools that already normalize
* tiktok rows can consume douyin rows without per-adapter glue. The
* search results page only surfaces the like count — plays/comments/
* shares are not in the card markup and we expose them as 0 rather
* than fabricate values; clients that need them should fetch
* /aweme/v1/web/aweme/detail/?aweme_id=... for the relevant id.
*
* Prerequisite: the bound Chrome profile must be logged in to
* https://www.douyin.com. The search results page renders an empty
* skeleton for anonymous visitors, which we surface as AuthRequiredError.
*/
import { cli, Strategy } from '@jackwener/opencli/registry';
import { ArgumentError, AuthRequiredError, CommandExecutionError } from '@jackwener/opencli/errors';

export const MAX_SEARCH_LIMIT = 30;
// Time budget for the SPA's initial DOM commit. Empirically the
// scroll-list `<li>` rows appear within 2-4s of navigation when logged
// in; 15s covers slow networks without blocking on a permanently-empty
// page (anonymous gate, network error).
export const RENDER_TIMEOUT_MS = 15000;

export function parseSearchLimit(raw) {
const parsed = Number(raw ?? 10);
if (!Number.isFinite(parsed) || !Number.isInteger(parsed)) {
throw new ArgumentError(`--limit must be an integer between 1 and ${MAX_SEARCH_LIMIT}, got ${JSON.stringify(raw)}`);
}
if (parsed < 1 || parsed > MAX_SEARCH_LIMIT) {
throw new ArgumentError(`--limit must be between 1 and ${MAX_SEARCH_LIMIT}, got ${parsed}`);
}
return parsed;
}

/**
* Parse a Douyin display count like "1.9万", "3.1万", "4702", "1.2亿"
* into a plain integer. Returns 0 for unparseable input rather than
* throwing — the CLI promises numeric columns and missing data is
* common enough on real result rows that a soft fallback is the right
* choice.
*/
export function parseDouyinCount(text) {
if (typeof text !== 'string') return 0;
const m = text.replace(/\s/g, '').match(/^(\d+(?:\.\d+)?)([万亿])?$/);
if (!m) {
const plain = Number(text.replace(/[,\s]/g, ''));
return Number.isFinite(plain) ? Math.round(plain) : 0;
}
const n = Number(m[1]);
if (!Number.isFinite(n)) return 0;
if (m[2] === '万') return Math.round(n * 10_000);
if (m[2] === '亿') return Math.round(n * 100_000_000);
return Math.round(n);
}

/**
* Resolve scheme-relative or absolute Douyin video links to the canonical
* https://www.douyin.com/video/<id> shape. Returns '' for unparseable
* input rather than throwing — callers expect a string column.
*/
export function normalizeDouyinVideoUrl(href) {
if (typeof href !== 'string' || !href) return '';
let full = href;
if (full.startsWith('//')) full = 'https:' + full;
else if (full.startsWith('/')) full = 'https://www.douyin.com' + full;
const idMatch = full.match(/\/video\/(\d+)/);
if (idMatch) return `https://www.douyin.com/video/${idMatch[1]}`;
return full;
}

/**
* Project a single rendered card into the canonical row shape. Operates
* on a serialized card payload (the raw `{url, leafTexts}` we collect
* via page.evaluate) so this function is unit-testable without a real
* browser.
*
* `leafTexts` is the ordered list of `textContent.trim()` for every leaf
* element inside the card (no children). The fields we want are
* identified by shape:
* - duration: matches `HH:MM:SS` or `MM:SS`
* - likes: matches `<digits>(.<digits>)?(万|亿)?` and ISN'T the duration
* - author: the text node immediately following an `@` text node
* - desc: the longest remaining leaf text
*/
export function projectCard(card, index) {
const url = normalizeDouyinVideoUrl(card?.url);
const texts = Array.isArray(card?.leafTexts) ? card.leafTexts.map((t) => String(t ?? '').trim()).filter(Boolean) : [];

const DURATION_RE = /^\d{1,2}:\d{2}(?::\d{2})?$/;
const COUNT_RE = /^\d+(?:\.\d+)?[万亿]?$/;

let likes = 0;
let author = '';
let longest = '';

for (let i = 0; i < texts.length; i++) {
const t = texts[i];
if (DURATION_RE.test(t)) continue;
if (!likes && COUNT_RE.test(t)) {
likes = parseDouyinCount(t);
continue;
}
if (t === '@' && !author) {
author = (texts[i + 1] ?? '').trim();
continue;
}
if (t === author) continue;
if (t.length > longest.length) longest = t;
}
let desc = longest;
// Strip a leading "@author" that some renders fuse into the desc text node.
if (author && desc.startsWith('@' + author)) {
desc = desc.slice(author.length + 1).trim();
}
return {
rank: index + 1,
desc,
author,
url,
plays: 0,
likes,
comments: 0,
shares: 0,
};
}

// JS snippet that waits for the scroll-list to populate, then returns
// `{state: 'rendered', cards}` or `{state: 'login_wall'}` /
// `{state: 'timeout'}`. Runs inside page.evaluate so we don't pay a
// round-trip per poll iteration.
const WAIT_AND_EXTRACT_JS = (timeoutMs) => `
new Promise((resolve) => {
const collectCards = () => {
const cards = [];
const lis = document.querySelectorAll('[data-e2e="scroll-list"] li');
for (const li of lis) {
const a = li.querySelector('a[href*="/video/"]');
if (!a) continue;
const leafTexts = [];
for (const el of li.querySelectorAll('*')) {
if (el.children.length > 0) continue;
const t = (el.textContent || '').trim();
if (t) leafTexts.push(t);
}
cards.push({ url: a.getAttribute('href') || '', leafTexts });
}
return cards;
};
const detectState = () => {
const cards = collectCards();
if (cards.length > 0) return { state: 'rendered', cards };
// Anonymous gate: Douyin renders a centered "登录后查看更多内容"
// overlay on /search/ for visitors without sessionid. Match either
// the literal Chinese prompt or a visible login modal/mask.
const text = (document.body && document.body.innerText) || '';
if (/登录后查看|请先登录|登录抖音/.test(text)) return { state: 'login_wall' };
const modal = document.querySelector('[class*="login-mask"], [class*="LoginMask"], [class*="login-modal"], dialog[role="dialog"]');
if (modal && modal instanceof HTMLElement) {
const r = modal.getBoundingClientRect();
const s = getComputedStyle(modal);
if (r.width > 0 && r.height > 0 && s.display !== 'none' && s.visibility !== 'hidden') {
return { state: 'login_wall' };
}
}
return null;
};
const found = detectState();
if (found) return resolve(found);
const observer = new MutationObserver(() => {
const s = detectState();
if (s) { observer.disconnect(); resolve(s); }
});
observer.observe(document.body, { childList: true, subtree: true });
setTimeout(() => {
observer.disconnect();
const fallback = detectState();
resolve(fallback ?? { state: 'timeout' });
}, ${timeoutMs});
})
`;

function unwrapEvaluateResult(payload) {
if (payload && !Array.isArray(payload) && typeof payload === 'object' && 'session' in payload && 'data' in payload) {
return payload.data;
}
return payload;
}

cli({
site: 'douyin',
name: 'search',
access: 'read',
description: '关键词搜索抖音视频',
domain: 'www.douyin.com',
strategy: Strategy.COOKIE,
args: [
{ name: 'query', required: true, positional: true, help: '搜索关键词' },
{ name: 'limit', type: 'int', default: 10, help: `结果数量 (1-${MAX_SEARCH_LIMIT})` },
],
columns: ['rank', 'desc', 'author', 'url', 'plays', 'likes', 'comments', 'shares'],
func: async (page, kwargs) => {
const limit = parseSearchLimit(kwargs.limit);
const keyword = String(kwargs.query ?? '').trim();
if (!keyword) {
throw new ArgumentError('douyin search 需要 <query> 关键词');
}
await page.goto(`https://www.douyin.com/search/${encodeURIComponent(keyword)}?type=video`);
let result;
try {
result = unwrapEvaluateResult(await page.evaluate(WAIT_AND_EXTRACT_JS(RENDER_TIMEOUT_MS)));
} catch (error) {
throw new CommandExecutionError(`Douyin search extraction failed: ${error instanceof Error ? error.message : String(error)}`);
}
if (!result || typeof result !== 'object') {
throw new CommandExecutionError('Douyin search: unexpected evaluator payload shape');
}
if (result.state === 'login_wall') {
throw new AuthRequiredError(
'www.douyin.com',
'Douyin search results are blocked behind a login wall — log in at https://www.douyin.com in Chrome first.',
);
}
if (result.state === 'timeout' || !Array.isArray(result.cards) || result.cards.length === 0) {
// No cards rendered within the budget AND no explicit login
// wall detected. Most common cause is still an unauthenticated
// session (the page just hides results silently); surface as
// AuthRequiredError with the same actionable message.
throw new AuthRequiredError(
'www.douyin.com',
'Douyin search returned no results. Log in to https://www.douyin.com in Chrome — anonymous sessions get an empty results page without a visible login prompt.',
);
}
return result.cards.slice(0, limit).map((card, index) => projectCard(card, index));
},
});
Loading