|
1 | 1 | import { cli, Strategy } from '@jackwener/opencli/registry'; |
2 | 2 | import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; |
| 3 | +import { stripHtml as stripHtmlText } from './text.js'; |
3 | 4 |
|
4 | | -// Light-weight HTML → text, preserving paragraph / heading / list-item |
5 | | -// line breaks. Zhihu answer `content` is HTML, so we map block-level |
6 | | -// closing tags + `<br>` to newlines before stripping the rest. |
7 | 5 | function stripHtml(html) { |
8 | | - if (!html) return ''; |
9 | | - return html |
10 | | - .replace(/<br\s*\/?\s*>/gi, '\n') |
11 | | - // Block-level closing tags become paragraph breaks (double |
12 | | - // newline) so the stripped text stays readable. The trailing |
13 | | - // `\n{3,}` collapse pass below normalizes accidental triples. |
14 | | - .replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n') |
15 | | - .replace(/<[^>]+>/g, '') |
16 | | - .replace(/&nbsp;/g, ' ') |
17 | | - .replace(/&lt;/g, '<') |
18 | | - .replace(/&gt;/g, '>') |
19 | | - .replace(/&amp;/g, '&') |
20 | | - .replace(/&quot;/g, '"') |
21 | | - .replace(/&#39;/g, "'") |
22 | | - .replace(/&#(\d+);/g, (_, value) => { |
23 | | - const codePoint = Number(value); |
24 | | - return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF |
25 | | - ? String.fromCodePoint(codePoint) |
26 | | - : _; |
27 | | - }) |
28 | | - .replace(/&#x([0-9a-f]+);/gi, (_, value) => { |
29 | | - const codePoint = Number.parseInt(value, 16); |
30 | | - return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF |
31 | | - ? String.fromCodePoint(codePoint) |
32 | | - : _; |
33 | | - }) |
34 | | - .replace(/\n{3,}/g, '\n\n') |
35 | | - .trim(); |
| 6 | + return stripHtmlText(html, { preserveBlocks: true }); |
36 | 7 | } |
37 | 8 |
|
38 | 9 | const ANSWER_ID_RE = /^\d+$/; |
|
0 commit comments