Skip to content

Commit 4fac911

Browse files
authored
feat(zhihu): add answer-detail to fetch a single answer's full content (#1528)
* feat(zhihu): add answer-detail to fetch a single answer's full content

  The existing `zhihu answer` adapter is a write (post an answer); the listing `zhihu question` truncates each answer's body to 200 chars. There was no way to fetch one specific answer's full content by id.

  New read adapter `zhihu answer-detail`:
  - Accepts a bare numeric answer id, a typed target `answer:<qid>:<aid>`, or a full Zhihu answer URL (the form you paste from a browser).
  - Calls `/api/v4/answers/<aid>?include=content,voteup_count,...,question` inside the cookie-bearing page context (Strategy.COOKIE).
  - Returns a single row with id / author / votes / comments / question_id / question_title / url / created_at / updated_at / content.

  The content column is the full stripped answer body by default — no silent truncation. `--max-content N` is an opt-in user cap (mirroring the wikipedia `page` flag), and `--max-content 0` (the default) means "no cap, full content".

  Important precision note: Zhihu answer ids since 2024 routinely exceed `Number.MAX_SAFE_INTEGER` (the test fixture uses the real id `1937205528846655537`). `data.id` is round-tripped through browser `JSON.parse` and would round to `1937205528846655500`, so the adapter deliberately ignores `data.id` for the canonical row id and anchors it to the already-validated input string instead. A regression test locks this contract in by mocking `data.id = 0` and asserting the row still carries the parsed input id.

  Typed errors: bad input → INVALID_INPUT; 401/403 → AuthRequiredError; other HTTP / null → FETCH_ERROR. No silent fallbacks, no sentinel strings.

  Live-verified against the example URL — fetched 5547 votes / 165 comments for answer id 1937205528846655537, end-to-end.

  16 unit tests, audits unchanged (typed-error-lint 189/189, silent-column-drop 103/103), manifest 816→817.

* fix(zhihu): tighten answer-detail contracts
1 parent b52da63 commit 4fac911

5 files changed

Lines changed: 608 additions & 1 deletion

File tree

cli-manifest.json

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28212,6 +28212,47 @@
2821228212
"sourceFile": "zhihu/answer.js",
2821328213
"navigateBefore": "https://www.zhihu.com"
2821428214
},
28215+
{
28216+
"site": "zhihu",
28217+
"name": "answer-detail",
28218+
"description": "知乎单个回答完整内容(按 answer ID 获取)",
28219+
"access": "read",
28220+
"domain": "www.zhihu.com",
28221+
"strategy": "cookie",
28222+
"browser": true,
28223+
"args": [
28224+
{
28225+
"name": "id",
28226+
"type": "str",
28227+
"required": true,
28228+
"positional": true,
28229+
"help": "Answer ID, full Zhihu answer URL, or typed target (answer:<qid>:<aid>)"
28230+
},
28231+
{
28232+
"name": "max-content",
28233+
"type": "int",
28234+
"default": 0,
28235+
"required": false,
28236+
"help": "Optional cap on stripped content length in characters (0 = no truncation, return the full answer)"
28237+
}
28238+
],
28239+
"columns": [
28240+
"id",
28241+
"author",
28242+
"votes",
28243+
"comments",
28244+
"question_id",
28245+
"question_title",
28246+
"url",
28247+
"created_at",
28248+
"updated_at",
28249+
"content"
28250+
],
28251+
"type": "js",
28252+
"modulePath": "zhihu/answer-detail.js",
28253+
"sourceFile": "zhihu/answer-detail.js",
28254+
"navigateBefore": "https://www.zhihu.com"
28255+
},
2821528256
{
2821628257
"site": "zhihu",
2821728258
"name": "collection",

clis/zhihu/answer-detail.js

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
import { cli, Strategy } from '@jackwener/opencli/registry';
2+
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
3+
4+
// Named character references understood by `stripHtml`. `&nbsp;` is
// deliberately mapped to a regular space so the output is plain text.
const NAMED_HTML_ENTITIES = Object.freeze({
    nbsp: ' ',
    lt: '<',
    gt: '>',
    amp: '&',
    quot: '"',
    apos: "'",
});

/**
 * Light-weight HTML → text conversion that preserves paragraph / heading /
 * list-item line breaks. Zhihu answer `content` is HTML, so `<br>` becomes a
 * newline and block-level closing tags become a paragraph break before every
 * remaining tag is removed.
 *
 * Character references are decoded in a SINGLE pass. Decoding them one
 * `.replace()` at a time lets the output of one step feed the next, so the
 * escaped text `&amp;quot;` would wrongly end up as `"` instead of `&quot;`.
 *
 * @param {string | null | undefined} html - Raw HTML fragment.
 * @returns {string} Trimmed plain text; `''` for empty / nullish input.
 */
function stripHtml(html) {
    if (!html) return '';
    return html
        .replace(/<br\s*\/?\s*>/gi, '\n')
        // Block-level closing tags become paragraph breaks (double newline);
        // the `\n{3,}` collapse below normalizes accidental triples.
        .replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n')
        .replace(/<[^>]+>/g, '')
        .replace(
            /&(?:(nbsp|lt|gt|amp|quot|apos)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
            (entity, named, decimal, hex) => {
                if (named) return NAMED_HTML_ENTITIES[named];
                const codePoint = decimal
                    ? Number.parseInt(decimal, 10)
                    : Number.parseInt(hex, 16);
                // Leave references that do not name a valid Unicode scalar
                // value untouched rather than throwing from fromCodePoint.
                const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
                if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) return entity;
                // Keep numeric NBSP consistent with the named `&nbsp;` mapping.
                if (codePoint === 0xa0) return ' ';
                return String.fromCodePoint(codePoint);
            },
        )
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
25+
26+
// --- Input shapes accepted as an answer target -------------------------

// Bare numeric answer id, e.g. `1937205528846655537`.
const ANSWER_ID_RE = /^\d+$/;
// Typed target `answer:<qid>:<aid>`; captures question id, then answer id.
const ANSWER_TYPED_RE = /^answer:(\d+):(\d+)$/;

// --- URL pathname shapes (matched against `URL#pathname`) ---------------

// `/question/<qid>/answer/<aid>` with an optional trailing slash.
const ANSWER_PATH_RE = /^\/question\/(\d+)\/answer\/(\d+)\/?$/;
// `/answer/<aid>` with an optional trailing slash (no question id).
const BARE_ANSWER_PATH_RE = /^\/answer\/(\d+)\/?$/;
// `/question/<qid>` with an optional trailing slash.
const QUESTION_PATH_RE = /^\/question\/(\d+)\/?$/;
// `/api/v4/questions/<qid>` with an optional trailing slash.
const QUESTION_API_PATH_RE = /^\/api\/v4\/questions\/(\d+)\/?$/;
32+
33+
/**
 * Resolves user input to string-safe Zhihu ids.
 *
 * Recognised shapes:
 *   - a bare numeric answer id (`1937205528846655537`);
 *   - the typed target `answer:<qid>:<aid>`;
 *   - an `https:` URL on `www.zhihu.com` / `zhihu.com` without credentials
 *     or an explicit port, whose path is `/question/<qid>/answer/<aid>` or
 *     `/answer/<aid>`.
 *
 * Ids are kept as the matched strings and never converted to numbers.
 *
 * @param {unknown} input - Raw argument value; coerced to a trimmed string.
 * @returns {{ answerId: string, questionId: string } | null} The parsed ids
 *   (`questionId` is `''` when the input carries none), or `null` when the
 *   input matches none of the shapes above.
 */
function parseAnswerTarget(input) {
    const raw = String(input ?? '').trim();
    if (raw === '') return null;

    if (ANSWER_ID_RE.test(raw)) {
        return { answerId: raw, questionId: '' };
    }

    const typedMatch = ANSWER_TYPED_RE.exec(raw);
    if (typedMatch) {
        return { questionId: typedMatch[1], answerId: typedMatch[2] };
    }

    let parsed;
    try {
        parsed = new URL(raw);
    } catch {
        // Not an absolute URL, and not one of the id forms handled above.
        return null;
    }

    const isPlainHttps = parsed.protocol === 'https:'
        && !parsed.username
        && !parsed.password
        && !parsed.port;
    const isZhihuHost = parsed.hostname === 'www.zhihu.com' || parsed.hostname === 'zhihu.com';
    if (!isPlainHttps || !isZhihuHost) return null;

    const nested = ANSWER_PATH_RE.exec(parsed.pathname);
    if (nested) {
        return { questionId: nested[1], answerId: nested[2] };
    }
    const bare = BARE_ANSWER_PATH_RE.exec(parsed.pathname);
    if (bare) {
        return { answerId: bare[1], questionId: '' };
    }
    return null;
}
65+
66+
/**
 * Convenience wrapper around `parseAnswerTarget` that yields only the
 * answer id.
 *
 * @param {unknown} input - Same inputs `parseAnswerTarget` accepts.
 * @returns {string | null} The answer id string, or `null` when the input
 *   cannot be parsed.
 */
function extractAnswerId(input) {
    const target = parseAnswerTarget(input);
    if (target == null) return null;
    return target.answerId ?? null;
}
69+
70+
/**
 * Pulls the question id out of an `https:` URL on `www.zhihu.com` /
 * `zhihu.com`. The path may be an answer page
 * (`/question/<qid>/answer/<aid>`), a question page (`/question/<qid>`) or
 * a question API path (`/api/v4/questions/<qid>`); they are tried in that
 * order.
 *
 * @param {unknown} input - Candidate URL; coerced to a trimmed string.
 * @returns {string} The question id as a string, or `''` when the input is
 *   empty, is not a parseable URL, is on another scheme/host, or has a path
 *   that matches none of the shapes above.
 */
function extractQuestionIdFromAnswerUrl(input) {
    const raw = String(input ?? '').trim();
    if (raw === '') return '';

    let parsed;
    try {
        parsed = new URL(raw);
    } catch {
        return '';
    }

    const { protocol, hostname, pathname } = parsed;
    const isZhihuHost = hostname === 'www.zhihu.com' || hostname === 'zhihu.com';
    if (protocol !== 'https:' || !isZhihuHost) return '';

    for (const pattern of [ANSWER_PATH_RE, QUESTION_PATH_RE, QUESTION_API_PATH_RE]) {
        const questionId = pattern.exec(pathname)?.[1];
        if (questionId) return questionId;
    }
    return '';
}
86+
87+
/**
 * Coerces a count field to a non-negative integer.
 *
 * @param {unknown} value - Candidate count.
 * @returns {number} `value` when it is an integer >= 0, otherwise `0`
 *   (non-numbers, fractions, NaN/Infinity and negatives all map to `0`).
 */
function normalizeCount(value) {
    if (!Number.isInteger(value)) return 0;
    return value < 0 ? 0 : value;
}
90+
91+
/**
 * Converts a Unix timestamp in seconds to an ISO-8601 string.
 *
 * @param {unknown} value - Candidate timestamp (seconds since the epoch).
 * @returns {string} The ISO string, or `''` when `value` is not a finite
 *   positive number or lies outside the range a `Date` can represent.
 */
function normalizeUnixSeconds(value) {
    if (typeof value !== 'number' || !Number.isFinite(value) || value <= 0) {
        return '';
    }
    const date = new Date(value * 1000);
    // A finite input can still overflow the Date range (±8.64e15 ms);
    // `toISOString()` throws RangeError on such an invalid Date, so treat it
    // like any other unusable timestamp instead of crashing the command.
    return Number.isNaN(date.getTime()) ? '' : date.toISOString();
}
96+
97+
// Registers the `zhihu answer-detail` read command.
//
// Flow of the handler:
//   1. Parse `id` (numeric id, typed target, or answer URL) and validate
//      `--max-content`; both raise ArgumentError on bad input.
//   2. Navigate to `/answer/<aid>` and, when the page object exposes
//      `getCurrentUrl`, read the question id from the resulting URL.
//   3. Fetch `/api/v4/answers/<aid>` inside the page via `page.evaluate`
//      with `credentials: 'include'`.
//   4. Map failures to typed errors: 401/403 → AuthRequiredError,
//      404 → EmptyResultError, anything else → CommandExecutionError.
//   5. Return exactly one row whose `id` is the parsed input string (never
//      the API's numeric `id`).
cli({
    site: 'zhihu',
    name: 'answer-detail',
    access: 'read',
    description: '知乎单个回答完整内容（按 answer ID 获取）',
    domain: 'www.zhihu.com',
    strategy: Strategy.COOKIE,
    args: [
        { name: 'id', required: true, positional: true, help: 'Answer ID, full Zhihu answer URL, or typed target (answer:<qid>:<aid>)' },
        { name: 'max-content', type: 'int', default: 0, help: 'Optional cap on stripped content length in characters (0 = no truncation, return the full answer)' },
    ],
    columns: ['id', 'author', 'votes', 'comments', 'question_id', 'question_title', 'url', 'created_at', 'updated_at', 'content'],
    func: async (page, kwargs) => {
        const target = parseAnswerTarget(kwargs.id);
        if (!target) {
            throw new ArgumentError(
                'Answer ID must be a numeric id, a Zhihu answer URL, or answer:<qid>:<aid>',
                'Example: opencli zhihu answer-detail 1937205528846655537',
            );
        }
        const { answerId } = target;

        // `--max-content 0` (the default) means "no cap, return the full
        // stripped answer". Any positive value is an opt-in user cap; the
        // body is never truncated unless the caller asked for it.
        const rawMaxContent = kwargs['max-content'];
        const maxContent = rawMaxContent == null ? 0 : Number(rawMaxContent);
        if (!Number.isInteger(maxContent) || maxContent < 0) {
            throw new ArgumentError(
                '--max-content must be a non-negative integer (0 = no cap, full content)',
                'Example: --max-content 2000',
            );
        }

        // Navigate to the answer page itself: this both seeds the
        // cookie/anti-bot context and works even when the caller did not
        // supply the parent question id (Zhihu redirects from
        // `/answer/<aid>` to the canonical `/question/<qid>/answer/<aid>`).
        try {
            await page.goto(`https://www.zhihu.com/answer/${answerId}`);
        } catch (err) {
            throw new CommandExecutionError(
                `Failed to open Zhihu answer ${answerId}: ${err instanceof Error ? err.message : String(err)}`,
                'Open the answer URL in Chrome and retry after the page is reachable.',
            );
        }

        // Best effort: a failure to read the current URL only means the
        // question id has to come from another source further down.
        const currentQuestionId = page.getCurrentUrl
            ? extractQuestionIdFromAnswerUrl(await page.getCurrentUrl().catch(() => ''))
            : '';

        const apiUrl = `https://www.zhihu.com/api/v4/answers/${answerId}?include=content,voteup_count,comment_count,author,created_time,updated_time,question`;
        // The script reports HTTP failures and JSON parse failures as marker
        // objects (`__httpError` / `__malformedJson`) so they can be turned
        // into typed errors on this side of the bridge.
        const data = await page.evaluate(`
            (async () => {
                const r = await fetch(${JSON.stringify(apiUrl)}, { credentials: 'include' });
                if (!r.ok) return { __httpError: r.status };
                try {
                    return await r.json();
                } catch (error) {
                    return { __malformedJson: error instanceof Error ? error.message : String(error) };
                }
            })()
        `).catch((err) => {
            throw new CommandExecutionError(
                `Zhihu answer detail request failed: ${err instanceof Error ? err.message : String(err)}`,
                'Try again later or rerun with -v for more detail.',
            );
        });

        if (!data || data.__httpError) {
            const status = data?.__httpError;
            if (status === 401 || status === 403) {
                throw new AuthRequiredError('www.zhihu.com', 'Failed to fetch Zhihu answer detail');
            }
            if (status === 404) {
                throw new EmptyResultError('zhihu answer-detail', `No Zhihu answer was found for ${answerId}.`);
            }
            throw new CommandExecutionError(
                status
                    ? `Zhihu answer detail request failed (HTTP ${status})`
                    : 'Zhihu answer detail request failed',
                'Try again later or rerun with -v for more detail',
            );
        }
        if (data.__malformedJson) {
            throw new CommandExecutionError(
                `Zhihu answer detail returned malformed JSON: ${data.__malformedJson}`,
                'Try again later or rerun with -v for more detail',
            );
        }
        if (typeof data !== 'object' || Array.isArray(data)) {
            throw new CommandExecutionError(
                'Zhihu answer detail returned a malformed payload',
                'Try again later or rerun with -v for more detail',
            );
        }
        if (data.error || data.error_msg || data.message) {
            // `error` may be a plain string or an object without `message`;
            // never surface the literal text "undefined" to the user.
            const errorText = typeof data.error === 'string' ? data.error : data.error?.message;
            const detail = errorText || data.error_msg || data.message || 'unknown error';
            throw new CommandExecutionError(
                `Zhihu answer detail returned an error payload: ${detail}`,
                'Try again later or rerun with -v for more detail',
            );
        }
        if (!Object.prototype.hasOwnProperty.call(data, 'content')) {
            throw new CommandExecutionError(
                'Zhihu answer detail payload did not include answer content',
                'Try again later or rerun with -v for more detail',
            );
        }

        const question = data.question || {};
        // Answer ids and newer question ids can exceed
        // Number.MAX_SAFE_INTEGER. Prefer ids parsed from user input or the
        // canonical redirected URL; only fall back to API numeric ids when
        // no string-safe source is available.
        const questionId = target.questionId
            || currentQuestionId
            || extractQuestionIdFromAnswerUrl(question.url)
            || (question.id == null ? '' : String(question.id));

        const stripped = stripHtml(data.content || '');
        // Truncation is opt-in only; with the default `maxContent === 0` the
        // full stripped body is returned.
        let content = stripped;
        if (maxContent > 0 && stripped.length > maxContent) {
            // Do not end on the first half of a surrogate pair (emoji, rare
            // CJK): a lone high surrogate is not valid text, so drop it.
            const lastUnit = stripped.charCodeAt(maxContent - 1);
            const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
            content = stripped.substring(0, endsOnHighSurrogate ? maxContent - 1 : maxContent);
        }

        return [{
            id: answerId,
            author: data.author?.name || 'anonymous',
            votes: normalizeCount(data.voteup_count),
            comments: normalizeCount(data.comment_count),
            question_id: questionId,
            question_title: question.title || '',
            url: questionId
                ? `https://www.zhihu.com/question/${questionId}/answer/${answerId}`
                : `https://www.zhihu.com/answer/${answerId}`,
            created_at: normalizeUnixSeconds(data.created_time),
            updated_at: normalizeUnixSeconds(data.updated_time),
            content,
        }];
    },
});
232+
233+
// Internal helpers re-exported under `__test__`; presumably consumed by the
// adapter's unit tests rather than by production callers (verify).
export const __test__ = {
    stripHtml,
    extractAnswerId,
    parseAnswerTarget,
    extractQuestionIdFromAnswerUrl,
};

0 commit comments

Comments
 (0)