Skip to content

Commit 000c867

Browse files
fix(zhihu): harden search pagination (#1615)
Co-authored-by: jackwener <jakevingoo@gmail.com>
1 parent 24f643a commit 000c867

5 files changed

Lines changed: 426 additions & 51 deletions

File tree

cli-manifest.json

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29835,7 +29835,20 @@
2983529835
"type": "int",
2983629836
"default": 10,
2983729837
"required": false,
29838-
"help": "Number of results"
29838+
"help": "Number of results (max 1000; use normal-sized requests)"
29839+
},
29840+
{
29841+
"name": "type",
29842+
"type": "str",
29843+
"default": "all",
29844+
"required": false,
29845+
"help": "Result type: all, answer, article, or question",
29846+
"choices": [
29847+
"all",
29848+
"answer",
29849+
"article",
29850+
"question"
29851+
]
2983929852
}
2984029853
],
2984129854
"columns": [

clis/zhihu/search.js

Lines changed: 206 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -1,54 +1,217 @@
1-
import { cli } from '@jackwener/opencli/registry';
1+
import { cli, Strategy } from '@jackwener/opencli/registry';
2+
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
3+
4+
/**
 * Convert an HTML fragment into trimmed plain text.
 *
 * Removes every tag, decodes the `&nbsp;`, `&lt;`, `&gt;` and `&amp;`
 * entities, then drops any `<em>` / `</em>` markers that only became
 * visible after entity decoding.
 *
 * @param {*} html - Fragment to clean. Falsy values yield an empty string;
 *   truthy non-string values are converted with `String()` instead of
 *   crashing on `.replace`.
 * @returns {string} The cleaned, trimmed text.
 */
function stripHtml(html) {
  if (!html) return '';
  // A truthy non-string (e.g. a numeric title) has no `.replace` method.
  const text = typeof html === 'string' ? html : String(html);
  return text
    .replace(/<[^>]+>/g, '')
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    // `&amp;` is decoded after `&lt;`/`&gt;` so `&amp;lt;` ends up as the
    // literal text `&lt;` rather than being decoded twice into `<`.
    .replace(/&amp;/g, '&')
    // Escaped highlight markers (`&lt;em&gt;`) only turn into tags now.
    .replace(/<em>/g, '')
    .replace(/<\/em>/g, '')
    .trim();
}
15+
16+
/**
 * Build the key used to de-duplicate search items.
 *
 * @param {{object?: {id?: *, type?: string}}} item - Raw search item.
 * @returns {string|null} `"<type>:<id>"` (type may be empty), or `null`
 *   when the wrapped object has no id.
 */
function itemKey(item) {
  const { id, type } = item.object || {};
  return id != null ? `${type || ''}:${id}` : null;
}
21+
22+
/**
 * Build the public Zhihu URL for a search result object.
 *
 * @param {{id?: *, type?: string, question?: {id?: *}}} obj - Result object.
 * @returns {string} The URL for an answer, article or question, or `''`
 *   when the type is not one of those or a required id is missing.
 */
function itemUrl(obj) {
  const asId = (value) => (value == null ? '' : String(value));
  const id = asId(obj.id);
  switch (obj.type) {
    case 'answer': {
      // Answer URLs need both the parent question id and the answer id.
      const questionId = asId(obj.question?.id);
      if (!questionId || !id) return '';
      return `https://www.zhihu.com/question/${questionId}/answer/${id}`;
    }
    case 'article':
      return id ? `https://zhuanlan.zhihu.com/p/${id}` : '';
    case 'question':
      return id ? `https://www.zhihu.com/question/${id}` : '';
    default:
      return '';
  }
}
36+
37+
/**
 * Validate a Zhihu search pagination URL and rewrite it to one canonical form.
 *
 * Accepts `api.zhihu.com/search_v3` and `www.zhihu.com/api/v4/search_v3`.
 * Both are rebuilt as `https://www.zhihu.com/api/v4/search_v3<query>`, so the
 * scheme, port, credentials and fragment of the input never reach the fetch
 * call, and equivalent URLs compare equal during repeat detection.
 *
 * @param {*} url - Candidate URL (typically `paging.next` from a response).
 * @returns {string} The canonical URL, or `''` when the input is not a
 *   non-empty string, cannot be parsed, or points anywhere else.
 */
function normalizeSearchUrl(url) {
  if (typeof url !== 'string' || !url) return '';
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return '';
  }
  const { hostname, pathname, search } = parsed;
  const isApiEndpoint = hostname === 'api.zhihu.com' && pathname === '/search_v3';
  const isWebEndpoint = hostname === 'www.zhihu.com' && pathname === '/api/v4/search_v3';
  if (!isApiEndpoint && !isWebEndpoint) return '';
  // Keep only the query string; everything else is fixed.
  return `https://www.zhihu.com/api/v4/search_v3${search}`;
}
52+
53+
// Largest value accepted for --limit; parseLimit rejects anything above it.
const MAX_LIMIT = 1000;
54+
// Sent as the `limit` query parameter of the first search request.
const PAGE_SIZE = 20;
55+
// Values accepted for --type; also used as the `choices` list of that argument.
const TYPES = ['all', 'answer', 'article', 'question'];
56+
57+
/**
 * Validate the `--limit` argument.
 *
 * @param {*} value - Raw argument value; `null`/`undefined` fall back to 10.
 * @returns {number} A positive integer no greater than `MAX_LIMIT`.
 * @throws {ArgumentError} When the value is not such an integer.
 */
function parseLimit(value) {
  const limit = Number(value ?? 10);
  const isAcceptable = Number.isInteger(limit) && limit > 0 && limit <= MAX_LIMIT;
  if (isAcceptable) return limit;
  throw new ArgumentError(`zhihu search --limit must be a positive integer no greater than ${MAX_LIMIT}`, 'Use a normal-sized limit to avoid slow requests or Zhihu risk controls');
}
64+
65+
/**
 * Validate the search query argument.
 *
 * @param {*} value - Raw argument value; falsy values count as empty.
 * @returns {string} The query with surrounding whitespace removed.
 * @throws {ArgumentError} When the trimmed query is empty.
 */
function requireQuery(value) {
  const query = String(value || '').trim();
  if (query.length > 0) return query;
  throw new ArgumentError('zhihu search query must not be empty', 'Example: opencli zhihu search codex');
}
72+
73+
/**
 * Validate the `--type` argument.
 *
 * @param {*} value - Raw argument value; falsy values fall back to `'all'`.
 * @returns {string} One of the entries of `TYPES`.
 * @throws {ArgumentError} When the value is not listed in `TYPES`.
 */
function requireType(value) {
  const type = String(value || 'all');
  if (TYPES.includes(type)) return type;
  throw new ArgumentError(`zhihu search --type must be one of: ${TYPES.join(', ')}`, 'Example: opencli zhihu search codex --type answer');
}
80+
81+
/**
 * Unwrap an evaluate result that arrives inside a `{ data, session }` envelope.
 *
 * @param {*} payload - Value returned by the page evaluation.
 * @returns {*} `payload.data` when `payload` is an object carrying both a
 *   `data` and a `session` key; otherwise `payload` unchanged.
 */
function unwrapEvaluateResult(payload) {
  const isEnvelope = Boolean(payload)
    && typeof payload === 'object'
    && 'data' in payload
    && 'session' in payload;
  return isEnvelope ? payload.data : payload;
}
85+
86+
/**
 * Validate the value returned by one search request.
 *
 * @param {*} data - Raw evaluate result, possibly wrapped in an envelope.
 * @param {string} url - URL that was requested; echoed in some error hints.
 * @returns {object} The payload, guaranteed to carry an array `data` and an
 *   object `paging`.
 * @throws {AuthRequiredError} When the request reported HTTP 401 or 403.
 * @throws {CommandExecutionError} For any other HTTP or fetch failure, or when
 *   the payload, its `data` list or its `paging` object is malformed.
 */
function requireSearchPayload(data, url) {
  const payload = unwrapEvaluateResult(data);
  const isPlainObject = Boolean(payload) && typeof payload === 'object' && !Array.isArray(payload);
  if (!isPlainObject) {
    throw new CommandExecutionError('Zhihu search returned malformed payload');
  }

  // The in-page fetch wrapper reports failures through these marker keys.
  const { __httpError: status, __fetchError: fetchError } = payload;
  if (status) {
    const isAuthFailure = status === 401 || status === 403;
    if (isAuthFailure) {
      throw new AuthRequiredError('www.zhihu.com', 'Failed to fetch search results from Zhihu');
    }
    throw new CommandExecutionError(`Zhihu search request failed${status ? ` (HTTP ${status})` : ''}`, 'Try again later or rerun with -v for more detail');
  }
  if (fetchError) {
    throw new CommandExecutionError('Zhihu search request failed', String(fetchError));
  }

  const { data: rows, paging } = payload;
  if (!Array.isArray(rows)) {
    throw new CommandExecutionError('Zhihu search returned malformed data list', `URL: ${url}`);
  }
  if (!paging || typeof paging !== 'object') {
    throw new CommandExecutionError('Zhihu search returned malformed paging data', `URL: ${url}`);
  }
  return payload;
}
109+
110+
/**
 * Turn one raw search item into an output row.
 *
 * @param {*} item - Raw entry from the response `data` list.
 * @returns {{item: object, key: string, row: object}|null} The original item,
 *   its de-duplication key and the row (`title`, `type`, `author`, `votes`,
 *   `url`); `null` when the entry is not a `search_result` wrapping an
 *   answer, article or question.
 * @throws {CommandExecutionError} When a supported entry lacks a key, URL or
 *   title.
 */
function normalizeResultItem(item) {
  const isSearchResult = Boolean(item)
    && typeof item === 'object'
    && item.type === 'search_result'
    && Boolean(item.object)
    && typeof item.object === 'object';
  if (!isSearchResult) return null;

  const { object: obj } = item;
  if (!['answer', 'article', 'question'].includes(obj.type)) return null;

  const key = itemKey(item);
  const url = itemUrl(obj);
  // Fall back to the parent question's name/title when the object has no title.
  const question = obj.question || {};
  const title = stripHtml(obj.title || question.name || question.title || '');
  if (!(key && url && title)) {
    throw new CommandExecutionError('Zhihu search returned malformed result row identity');
  }

  const row = {
    title,
    type: obj.type,
    author: obj.author?.name || '',
    votes: obj.voteup_count || 0,
    url,
  };
  return { item, key, row };
}
135+
2136
cli({
3137
site: 'zhihu',
4138
name: 'search',
5139
access: 'read',
6140
description: '知乎搜索',
7141
domain: 'www.zhihu.com',
142+
strategy: Strategy.COOKIE,
8143
args: [
9144
{ name: 'query', required: true, positional: true, help: 'Search query' },
10-
{ name: 'limit', type: 'int', default: 10, help: 'Number of results' },
145+
{ name: 'limit', type: 'int', default: 10, help: 'Number of results (max 1000; use normal-sized requests)' },
146+
{ name: 'type', default: 'all', choices: TYPES, help: 'Result type: all, answer, article, or question' },
11147
],
12148
columns: ['rank', 'title', 'type', 'author', 'votes', 'url'],
13-
pipeline: [
14-
{ navigate: 'https://www.zhihu.com' },
15-
{ evaluate: `(async () => {
16-
const strip = (html) => (html || '').replace(/<[^>]+>/g, '').replace(/&nbsp;/g, ' ').replace(/&lt;/g, '<').replace(/&gt;/g, '>').replace(/&amp;/g, '&').replace(/<em>/g, '').replace(/<\\/em>/g, '').trim();
17-
const keyword = \${{ args.query | json }};
18-
const limit = \${{ args.limit }};
19-
var fetchLimit = Math.max(limit * 3, 30);
20-
const res = await fetch('https://www.zhihu.com/api/v4/search_v3?q=' + encodeURIComponent(keyword) + '&t=general&offset=0&limit=' + fetchLimit, {
21-
credentials: 'include'
22-
});
23-
const d = await res.json();
24-
return (d?.data || [])
25-
.filter(item => item.object && (item.object.type === 'answer' || item.object.type === 'article' || item.object.type === 'question'))
26-
.map(item => {
27-
const obj = item.object || {};
28-
const q = obj.question || {};
29-
return {
30-
type: obj.type,
31-
title: strip(obj.title || q.name || ''),
32-
excerpt: strip(obj.excerpt || '').substring(0, 100),
33-
author: obj.author?.name || '',
34-
votes: obj.voteup_count || 0,
35-
url: obj.type === 'answer'
36-
? 'https://www.zhihu.com/question/' + q.id + '/answer/' + obj.id
37-
: obj.type === 'article'
38-
? 'https://zhuanlan.zhihu.com/p/' + obj.id
39-
: 'https://www.zhihu.com/question/' + obj.id,
40-
};
41-
});
42-
})()
43-
` },
44-
{ map: {
45-
rank: '${{ index + 1 }}',
46-
title: '${{ item.title }}',
47-
type: '${{ item.type }}',
48-
author: '${{ item.author }}',
49-
votes: '${{ item.votes }}',
50-
url: '${{ item.url }}',
51-
} },
52-
{ limit: '${{ args.limit }}' },
53-
],
149+
func: async (page, kwargs) => {
150+
const query = requireQuery(kwargs.query);
151+
const resultLimit = parseLimit(kwargs.limit);
152+
const type = requireType(kwargs.type);
153+
await page.goto('https://www.zhihu.com');
154+
let url = 'https://www.zhihu.com/api/v4/search_v3'
155+
+ `?q=${encodeURIComponent(query)}&t=general&offset=0&limit=${PAGE_SIZE}`;
156+
const results = [];
157+
const seen = new Set();
158+
const visited = new Set();
159+
while (url && results.length < resultLimit && !visited.has(url)) {
160+
visited.add(url);
161+
const data = requireSearchPayload(await page.evaluate(`
162+
(async () => {
163+
try {
164+
const r = await fetch(${JSON.stringify(url)}, { credentials: 'include' });
165+
if (!r.ok) return { __httpError: r.status };
166+
return await r.json();
167+
} catch (err) {
168+
return { __fetchError: err?.message || String(err) };
169+
}
170+
})()
171+
`), url);
172+
for (const item of data.data) {
173+
const rawType = item?.object?.type;
174+
if (type !== 'all' && rawType && rawType !== type) continue;
175+
const normalized = normalizeResultItem(item);
176+
if (!normalized) continue;
177+
if (type !== 'all' && normalized.row.type !== type) continue;
178+
if (seen.has(normalized.key)) continue;
179+
seen.add(normalized.key);
180+
results.push(normalized.row);
181+
if (results.length >= resultLimit) break;
182+
}
183+
if (results.length >= resultLimit) break;
184+
if (data.paging?.is_end) break;
185+
const next = normalizeSearchUrl(data.paging?.next);
186+
if (!next) {
187+
throw new CommandExecutionError('Zhihu search pagination returned malformed next URL');
188+
}
189+
if (visited.has(next)) {
190+
throw new CommandExecutionError('Zhihu search pagination returned a repeated next URL');
191+
}
192+
url = next;
193+
}
194+
if (results.length === 0) {
195+
throw new EmptyResultError('zhihu search', `No ${type === 'all' ? '' : `${type} `}results found for "${query}"`);
196+
}
197+
return results.map((row, i) => {
198+
return {
199+
rank: i + 1,
200+
...row,
201+
};
202+
});
203+
},
54204
});
205+
206+
// Bundle of this module's internal helpers; presumably consumed by unit tests
// (inferred from the export name — confirm against the test file).
export const __test__ = {
  stripHtml, itemKey, itemUrl, normalizeSearchUrl,
  parseLimit, requireQuery, requireType,
  unwrapEvaluateResult, requireSearchPayload, normalizeResultItem,
};

0 commit comments

Comments
 (0)