|
| 1 | +import { cli, Strategy } from '@jackwener/opencli/registry'; |
| 2 | +import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; |
| 3 | + |
// Entities decoded by `stripHtml`, keyed by the text between `&` and `;`.
const HTML_ENTITIES = Object.freeze({
    nbsp: ' ',
    lt: '<',
    gt: '>',
    amp: '&',
    quot: '"',
    apos: "'",
    '#39': "'",
    '#x27': "'",
});

/**
 * Light-weight HTML → text conversion that keeps paragraph / heading /
 * list-item line breaks.
 *
 * `<br>` becomes a single newline and block-level closing tags become a
 * paragraph break (double newline); every remaining tag is dropped.
 * Entities are decoded only after tags are removed, so escaped markup
 * such as `&lt;b&gt;` survives as literal text.
 *
 * @param {string} html - HTML fragment; any falsy value yields ''.
 * @returns {string} Plain text, trimmed, with runs of three or more
 *   newlines collapsed to exactly two.
 */
function stripHtml(html) {
    if (!html) return '';
    return html
        .replace(/<br\s*\/?\s*>/gi, '\n')
        .replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n')
        .replace(/<[^>]+>/g, '')
        // Single pass on purpose: chained per-entity replaces would decode
        // `&amp;quot;` twice (first to `&quot;`, then to `"`).
        .replace(/&(nbsp|lt|gt|amp|quot|apos|#39|#x27);/g, (_, name) => HTML_ENTITIES[name])
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
| 25 | + |
// Exact input shapes recognised by the target parsers: a bare numeric id,
// the typed `answer:<qid>:<aid>` form, and the Zhihu URL paths for
// answers and questions (each with an optional trailing slash).
const ANSWER_ID_RE = /^\d+$/;
const ANSWER_TYPED_RE = /^answer:(\d+):(\d+)$/;
const ANSWER_PATH_RE = /^\/question\/(\d+)\/answer\/(\d+)\/?$/;
const BARE_ANSWER_PATH_RE = /^\/answer\/(\d+)\/?$/;
const QUESTION_PATH_RE = /^\/question\/(\d+)\/?$/;
const QUESTION_API_PATH_RE = /^\/api\/v4\/questions\/(\d+)\/?$/;

/**
 * Resolve user input into answer / question ids, kept as strings.
 *
 * Three shapes are accepted:
 *   - a bare numeric id (`1937205528846655537`);
 *   - the typed target form `answer:<qid>:<aid>`;
 *   - an https URL on `www.zhihu.com` / `zhihu.com` without credentials
 *     or an explicit port, whose path is `/question/<qid>/answer/<aid>`
 *     or `/answer/<aid>`.
 *
 * @param {*} input - Raw argument; stringified and trimmed before matching.
 * @returns {{answerId: string, questionId: string} | null} The parsed ids
 *   (`questionId` is '' when the input does not carry one), or null when
 *   the input matches none of the accepted shapes.
 */
function parseAnswerTarget(input) {
    const raw = String(input ?? '').trim();
    if (raw === '') return null;

    if (ANSWER_ID_RE.test(raw)) {
        return { answerId: raw, questionId: '' };
    }

    const typedMatch = ANSWER_TYPED_RE.exec(raw);
    if (typedMatch) {
        const [, questionId, answerId] = typedMatch;
        return { questionId, answerId };
    }

    let parsed;
    try {
        parsed = new URL(raw);
    } catch {
        // Not an absolute URL, and none of the plain forms matched either.
        return null;
    }

    const isZhihuHost = parsed.hostname === 'www.zhihu.com' || parsed.hostname === 'zhihu.com';
    const isPlainHttps = parsed.protocol === 'https:'
        && !parsed.username
        && !parsed.password
        && !parsed.port;
    if (!isPlainHttps || !isZhihuHost) return null;

    const fullPath = ANSWER_PATH_RE.exec(parsed.pathname);
    if (fullPath) {
        return { questionId: fullPath[1], answerId: fullPath[2] };
    }
    const barePath = BARE_ANSWER_PATH_RE.exec(parsed.pathname);
    if (barePath) {
        return { answerId: barePath[1], questionId: '' };
    }
    return null;
}
| 65 | + |
/**
 * Convenience wrapper around `parseAnswerTarget` that yields only the
 * answer id.
 *
 * @param {*} input - Same inputs `parseAnswerTarget` accepts.
 * @returns {string | null} The answer id, or null when the input does
 *   not parse.
 */
function extractAnswerId(input) {
    const target = parseAnswerTarget(input);
    if (target == null) return null;
    return target.answerId ?? null;
}
| 69 | + |
/**
 * Pull the question id out of a Zhihu URL.
 *
 * Only https URLs on `www.zhihu.com` / `zhihu.com` are considered. The
 * path is tried against the answer path, the question path and the v4
 * question API path, in that order.
 *
 * @param {*} input - Candidate URL; stringified and trimmed first.
 * @returns {string} The question id, or '' when the input is empty, is
 *   not a parseable URL, is not an https Zhihu URL, or has a path that
 *   matches none of the known shapes.
 */
function extractQuestionIdFromAnswerUrl(input) {
    const raw = String(input ?? '').trim();
    if (raw === '') return '';

    let parsed;
    try {
        parsed = new URL(raw);
    } catch {
        return '';
    }

    const { protocol, hostname, pathname } = parsed;
    const onZhihu = hostname === 'www.zhihu.com' || hostname === 'zhihu.com';
    if (protocol !== 'https:' || !onZhihu) return '';

    for (const pattern of [ANSWER_PATH_RE, QUESTION_PATH_RE, QUESTION_API_PATH_RE]) {
        const questionId = pattern.exec(pathname)?.[1];
        if (questionId) return questionId;
    }
    return '';
}
| 86 | + |
/**
 * Coerce a counter to a safe value for output.
 *
 * @param {*} value - Candidate count.
 * @returns {number} `value` when it is a non-negative integer, else 0.
 */
function normalizeCount(value) {
    if (!Number.isInteger(value)) return 0;
    return value < 0 ? 0 : value;
}
| 90 | + |
/**
 * Convert a Unix timestamp in seconds to an ISO-8601 string.
 *
 * @param {*} value - Candidate timestamp in seconds.
 * @returns {string} `Date#toISOString()` output for `value * 1000`, or ''
 *   when `value` is not a finite positive number or falls outside the
 *   range a `Date` can represent.
 */
function normalizeUnixSeconds(value) {
    if (typeof value !== 'number' || !Number.isFinite(value) || value <= 0) {
        return '';
    }
    const date = new Date(value * 1000);
    // A finite but huge timestamp yields an invalid Date, on which
    // toISOString() throws RangeError; report it as "no timestamp".
    return Number.isNaN(date.getTime()) ? '' : date.toISOString();
}
| 96 | + |
// Cut `text` to at most `limit` UTF-16 code units. When the cut would
// fall between the two halves of a surrogate pair, the dangling high
// surrogate is dropped so the result never ends in half a character.
function truncateContent(text, limit) {
    if (limit <= 0 || text.length <= limit) return text;
    const last = text.charCodeAt(limit - 1);
    const next = text.charCodeAt(limit);
    const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
    return text.substring(0, splitsPair ? limit - 1 : limit);
}

// Registers `zhihu answer-detail`: fetches one answer through the v4 API
// from inside the page context and returns a single row with the stripped
// content.
//
// Arguments:
//   id            - numeric answer id, Zhihu answer URL, or answer:<qid>:<aid>
//   --max-content - optional cap on the stripped content (0 = no cap)
//
// Throws:
//   ArgumentError         - unparseable id or invalid --max-content
//   AuthRequiredError     - the API answered HTTP 401 / 403
//   EmptyResultError      - the API answered HTTP 404
//   CommandExecutionError - navigation failure, request failure, other
//                           HTTP errors, or a malformed / error payload
cli({
    site: 'zhihu',
    name: 'answer-detail',
    access: 'read',
    description: '知乎单个回答完整内容(按 answer ID 获取)',
    domain: 'www.zhihu.com',
    strategy: Strategy.COOKIE,
    args: [
        { name: 'id', required: true, positional: true, help: 'Answer ID, full Zhihu answer URL, or typed target (answer:<qid>:<aid>)' },
        { name: 'max-content', type: 'int', default: 0, help: 'Optional cap on stripped content length in characters (0 = no truncation, return the full answer)' },
    ],
    columns: ['id', 'author', 'votes', 'comments', 'question_id', 'question_title', 'url', 'created_at', 'updated_at', 'content'],
    func: async (page, kwargs) => {
        const target = parseAnswerTarget(kwargs.id);
        if (!target) {
            throw new ArgumentError(
                'Answer ID must be a numeric id, a Zhihu answer URL, or answer:<qid>:<aid>',
                'Example: opencli zhihu answer-detail 1937205528846655537',
            );
        }
        const { answerId } = target;

        // `--max-content 0` (the default) means "no cap": truncation is
        // strictly opt-in, the full stripped answer is returned otherwise.
        const rawMaxContent = kwargs['max-content'];
        const maxContent = rawMaxContent == null ? 0 : Number(rawMaxContent);
        if (!Number.isInteger(maxContent) || maxContent < 0) {
            throw new ArgumentError(
                '--max-content must be a non-negative integer (0 = no cap, full content)',
                'Example: --max-content 2000',
            );
        }

        // Open the answer page itself. This seeds the cookie / anti-bot
        // context for the API call below and needs no question id; Zhihu
        // is expected to redirect `/answer/<aid>` to the canonical
        // `/question/<qid>/answer/<aid>` URL.
        try {
            await page.goto(`https://www.zhihu.com/answer/${answerId}`);
        } catch (err) {
            throw new CommandExecutionError(
                `Failed to open Zhihu answer ${answerId}: ${err instanceof Error ? err.message : String(err)}`,
                'Open the answer URL in Chrome and retry after the page is reachable.',
            );
        }

        // Best effort: read the question id from the post-navigation URL
        // when the page object can report it; failures degrade to ''.
        const currentQuestionId = page.getCurrentUrl
            ? extractQuestionIdFromAnswerUrl(await page.getCurrentUrl().catch(() => ''))
            : '';

        const apiUrl = `https://www.zhihu.com/api/v4/answers/${answerId}?include=content,voteup_count,comment_count,author,created_time,updated_time,question`;
        // The fetch runs inside the page so the browser session cookies are
        // sent. HTTP and JSON-parse failures come back as sentinel objects
        // (`__httpError`, `__malformedJson`) rather than thrown errors.
        const data = await page.evaluate(`
            (async () => {
                const r = await fetch(${JSON.stringify(apiUrl)}, { credentials: 'include' });
                if (!r.ok) return { __httpError: r.status };
                try {
                    return await r.json();
                } catch (error) {
                    return { __malformedJson: error instanceof Error ? error.message : String(error) };
                }
            })()
        `).catch((err) => {
            throw new CommandExecutionError(
                `Zhihu answer detail request failed: ${err instanceof Error ? err.message : String(err)}`,
                'Try again later or rerun with -v for more detail.',
            );
        });

        if (!data || data.__httpError) {
            const status = data?.__httpError;
            if (status === 401 || status === 403) {
                throw new AuthRequiredError('www.zhihu.com', 'Failed to fetch Zhihu answer detail');
            }
            if (status === 404) {
                throw new EmptyResultError('zhihu answer-detail', `No Zhihu answer was found for ${answerId}.`);
            }
            throw new CommandExecutionError(
                status
                    ? `Zhihu answer detail request failed (HTTP ${status})`
                    : 'Zhihu answer detail request failed',
                'Try again later or rerun with -v for more detail',
            );
        }
        if (data.__malformedJson) {
            throw new CommandExecutionError(
                `Zhihu answer detail returned malformed JSON: ${data.__malformedJson}`,
                'Try again later or rerun with -v for more detail',
            );
        }
        if (typeof data !== 'object' || Array.isArray(data)) {
            throw new CommandExecutionError(
                'Zhihu answer detail returned a malformed payload',
                'Try again later or rerun with -v for more detail',
            );
        }
        if (data.error || data.error_msg || data.message) {
            // `error` may be a plain string or an object without `message`;
            // fall through the alternatives so the text never reads
            // "undefined".
            const errorDetail = (typeof data.error === 'string' ? data.error : data.error?.message)
                || data.error_msg
                || data.message
                || 'unknown error';
            throw new CommandExecutionError(
                `Zhihu answer detail returned an error payload: ${errorDetail}`,
                'Try again later or rerun with -v for more detail',
            );
        }
        if (!Object.prototype.hasOwnProperty.call(data, 'content')) {
            throw new CommandExecutionError(
                'Zhihu answer detail payload did not include answer content',
                'Try again later or rerun with -v for more detail',
            );
        }

        const question = data.question || {};
        // Answer ids and newer question ids can exceed
        // Number.MAX_SAFE_INTEGER. Prefer ids parsed from user input or
        // the redirected URL; fall back to the API's numeric id only when
        // no string-safe source is available.
        const questionId = target.questionId
            || currentQuestionId
            || extractQuestionIdFromAnswerUrl(question.url)
            || (question.id == null ? '' : String(question.id));

        const stripped = stripHtml(data.content || '');
        // No-op when maxContent is 0 or the text already fits.
        const content = truncateContent(stripped, maxContent);

        return [{
            id: answerId,
            author: data.author?.name || 'anonymous',
            votes: normalizeCount(data.voteup_count),
            comments: normalizeCount(data.comment_count),
            question_id: questionId,
            question_title: question.title || '',
            url: questionId
                ? `https://www.zhihu.com/question/${questionId}/answer/${answerId}`
                : `https://www.zhihu.com/answer/${answerId}`,
            created_at: normalizeUnixSeconds(data.created_time),
            updated_at: normalizeUnixSeconds(data.updated_time),
            content,
        }];
    },
});
| 232 | + |
// Internal helpers exposed for unit tests only.
export const __test__ = {
    stripHtml,
    extractAnswerId,
    parseAnswerTarget,
    extractQuestionIdFromAnswerUrl,
};
0 commit comments