Skip to content

Commit 7362ced

Browse files
lenovobenben, lihaidong, and jackwener
authored
fix(zhihu): decode numeric entities in text output (jackwener#1695)
* fix(zhihu): decode numeric entities in text output
* fix(zhihu): decode collection titles

---------

Co-authored-by: lihaidong <lihaidong@kingsoft.com>
Co-authored-by: jackwener <jakevingoo@gmail.com>
1 parent 6b8d30b commit 7362ced

10 files changed

Lines changed: 69 additions & 94 deletions

File tree

clis/zhihu/answer-comments.js

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,9 @@
11
import { cli, Strategy } from '@jackwener/opencli/registry';
22
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
3-
4-
function decodeEntity(codePoint) {
5-
return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF
6-
? String.fromCodePoint(codePoint)
7-
: null;
8-
}
3+
import { stripHtml as stripHtmlText } from './text.js';
94

105
function stripHtml(html) {
11-
if (!html) return '';
12-
return html
13-
.replace(/<br\s*\/?\s*>/gi, '\n')
14-
.replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n')
15-
.replace(/<[^>]+>/g, '')
16-
.replace(/&nbsp;/g, ' ')
17-
.replace(/&lt;/g, '<')
18-
.replace(/&gt;/g, '>')
19-
.replace(/&amp;/g, '&')
20-
.replace(/&quot;/g, '"')
21-
.replace(/&#39;/g, "'")
22-
.replace(/&#(\d+);/g, (entity, value) => decodeEntity(Number(value)) ?? entity)
23-
.replace(/&#x([0-9a-f]+);/gi, (entity, value) => decodeEntity(Number.parseInt(value, 16)) ?? entity)
24-
.replace(/\n{3,}/g, '\n\n')
25-
.trim();
6+
return stripHtmlText(html, { preserveBlocks: true });
267
}
278

289
const ANSWER_ID_RE = /^\d+$/;

clis/zhihu/answer-detail.js

Lines changed: 2 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,9 @@
11
import { cli, Strategy } from '@jackwener/opencli/registry';
22
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
3+
import { stripHtml as stripHtmlText } from './text.js';
34

4-
// Light-weight HTML → text, preserving paragraph / heading / list-item
5-
// line breaks. Zhihu answer `content` is HTML, so we map block-level
6-
// closing tags + `<br>` to newlines before stripping the rest.
75
function stripHtml(html) {
8-
if (!html) return '';
9-
return html
10-
.replace(/<br\s*\/?\s*>/gi, '\n')
11-
// Block-level closing tags become paragraph breaks (double
12-
// newline) so the stripped text stays readable. The trailing
13-
// `\n{3,}` collapse pass below normalizes accidental triples.
14-
.replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n')
15-
.replace(/<[^>]+>/g, '')
16-
.replace(/&nbsp;/g, ' ')
17-
.replace(/&lt;/g, '<')
18-
.replace(/&gt;/g, '>')
19-
.replace(/&amp;/g, '&')
20-
.replace(/&quot;/g, '"')
21-
.replace(/&#39;/g, "'")
22-
.replace(/&#(\d+);/g, (_, value) => {
23-
const codePoint = Number(value);
24-
return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF
25-
? String.fromCodePoint(codePoint)
26-
: _;
27-
})
28-
.replace(/&#x([0-9a-f]+);/gi, (_, value) => {
29-
const codePoint = Number.parseInt(value, 16);
30-
return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF
31-
? String.fromCodePoint(codePoint)
32-
: _;
33-
})
34-
.replace(/\n{3,}/g, '\n\n')
35-
.trim();
6+
return stripHtmlText(html, { preserveBlocks: true });
367
}
378

389
const ANSWER_ID_RE = /^\d+$/;

clis/zhihu/collection.js

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,7 @@
11
import { cli, Strategy } from '@jackwener/opencli/registry';
22
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
33
import { log } from '@jackwener/opencli/logger';
4-
5-
function stripHtml(html) {
6-
return html
7-
.replace(/<[^>]+>/g, '')
8-
.replace(/&nbsp;/g, ' ')
9-
.replace(/&lt;/g, '<')
10-
.replace(/&gt;/g, '>')
11-
.replace(/&amp;/g, '&')
12-
.replace(/&quot;/g, '"')
13-
.replace(/<em>/g, '')
14-
.replace(/<\/em>/g, '')
15-
.trim();
16-
}
4+
import { stripHtml } from './text.js';
175

186
function validatePositiveInt(value, name) {
197
const n = Number(value);
@@ -106,7 +94,7 @@ function mapCollectionItem(item, rank) {
10694
return {
10795
rank,
10896
type,
109-
title: title.substring(0, 100),
97+
title: stripHtml(title).substring(0, 100),
11098
author,
11199
votes,
112100
excerpt,

clis/zhihu/collection.test.js

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@ describe('zhihu collection', () => {
3434
content: {
3535
type: 'answer',
3636
id: 123456,
37-
question: { id: 789012, title: 'Test Question' },
37+
question: { id: 789012, title: '&#34;Test&#34; &#x26; Question' },
3838
author: { name: 'test_author' },
3939
voteup_count: 42,
40-
content: '<p>Test answer content</p>',
40+
content: '<p>&#34;Test&#34; &#x26; answer content</p>',
4141
url: 'https://www.zhihu.com/question/789012/answer/123456',
4242
},
4343
},
@@ -54,9 +54,10 @@ describe('zhihu collection', () => {
5454
expect(result[0]).toMatchObject({
5555
rank: 1,
5656
type: 'answer',
57-
title: 'Test Question',
57+
title: '"Test" & Question',
5858
author: 'test_author',
5959
votes: 42,
60+
excerpt: '"Test" & answer content',
6061
url: 'https://www.zhihu.com/question/789012/answer/123456',
6162
});
6263

clis/zhihu/question.js

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,6 @@
11
import { cli, Strategy } from '@jackwener/opencli/registry';
22
import { AuthRequiredError, CliError } from '@jackwener/opencli/errors';
3-
function stripHtml(html) {
4-
return html
5-
.replace(/<[^>]+>/g, '')
6-
.replace(/&nbsp;/g, ' ')
7-
.replace(/&lt;/g, '<')
8-
.replace(/&gt;/g, '>')
9-
.replace(/&amp;/g, '&')
10-
.trim();
11-
}
3+
import { stripHtml } from './text.js';
124

135
function answerIdFromUrl(url) {
146
if (typeof url !== 'string') return '';

clis/zhihu/question.test.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ describe('zhihu question', () => {
2020
id: '2036567240334653053',
2121
author: { name: 'alice' },
2222
voteup_count: 12,
23-
content: 'Hello Zhihu',
23+
content: '<p>&#34;Hello&#34; &#x26; Zhihu</p>',
2424
},
2525
],
2626
};
@@ -33,7 +33,7 @@ describe('zhihu question', () => {
3333
author: 'alice',
3434
votes: 12,
3535
url: 'https://www.zhihu.com/question/2021881398772981878/answer/2036567240334653053',
36-
content: 'Hello Zhihu',
36+
content: '"Hello" & Zhihu',
3737
},
3838
]);
3939
expect(goto).toHaveBeenCalledWith('https://www.zhihu.com/question/2021881398772981878');

clis/zhihu/search.js

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,6 @@
11
import { cli, Strategy } from '@jackwener/opencli/registry';
22
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
3-
4-
function stripHtml(html) {
5-
return (html || '')
6-
.replace(/<[^>]+>/g, '')
7-
.replace(/&nbsp;/g, ' ')
8-
.replace(/&lt;/g, '<')
9-
.replace(/&gt;/g, '>')
10-
.replace(/&amp;/g, '&')
11-
.replace(/<em>/g, '')
12-
.replace(/<\/em>/g, '')
13-
.trim();
14-
}
3+
import { stripHtml } from './text.js';
154

165
function itemKey(item) {
176
const obj = item.object || {};

clis/zhihu/search.test.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ describe('zhihu search', () => {
3636
type: 'answer',
3737
author: { name: 'alice' },
3838
voteup_count: 12,
39-
question: { id: 'q1', name: '<em>Codex</em> question' },
39+
question: { id: 'q1', name: '<em>Codex</em> &#34;question&#34;' },
4040
},
4141
},
4242
{
@@ -57,7 +57,7 @@ describe('zhihu search', () => {
5757
await expect(cmd.func(page, { query: 'codex', limit: 2 })).resolves.toEqual([
5858
{
5959
rank: 1,
60-
title: 'Codex question',
60+
title: 'Codex "question"',
6161
type: 'answer',
6262
author: 'alice',
6363
votes: 12,

clis/zhihu/text.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Named entities recognised by stripHtml, keyed by the name between `&` and `;`.
const NAMED_ENTITIES = Object.freeze({
  nbsp: ' ',
  lt: '<',
  gt: '>',
  amp: '&',
  quot: '"',
});

// One pattern for every supported entity so decoding happens in a single pass.
// Capture groups: 1 = decimal digits, 2 = hexadecimal digits, 3 = entity name.
// Named entities stay case-sensitive; the hex marker accepts both `x` and `X`.
const ENTITY_RE = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(nbsp|lt|gt|amp|quot));/g;

/**
 * Convert a numeric character reference value into its character.
 *
 * @param {number} codePoint - Parsed value of the numeric entity.
 * @returns {string|null} The decoded character, or `null` when the value is
 *   not an integer, lies outside 0..0x10FFFF, or is a UTF-16 surrogate
 *   (0xD800..0xDFFF), which would otherwise yield a lone surrogate.
 */
function decodeEntity(codePoint) {
  if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10FFFF) {
    return null;
  }
  if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
    return null;
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Strip HTML tags from `html` and decode the supported character entities
 * (`&nbsp;`, `&lt;`, `&gt;`, `&amp;`, `&quot;`, decimal `&#N;`, hex `&#xN;`).
 *
 * Entities are decoded in one pass over the tag-stripped text, so already
 * escaped input is unescaped exactly once: `&amp;quot;` becomes `&quot;`,
 * not `"`. Numeric entities that do not map to a valid character are left
 * in the output unchanged.
 *
 * @param {*} html - Markup to convert; falsy values produce `''`, anything
 *   else is coerced with `String()`.
 * @param {object} [options]
 * @param {boolean} [options.preserveBlocks=false] - When true, `<br>` becomes
 *   a newline and closing `p`/`div`/`h1`-`h6`/`li`/`blockquote` tags become a
 *   blank line before the remaining tags are removed.
 * @returns {string} Trimmed plain text with runs of 3+ newlines collapsed to 2.
 */
export function stripHtml(html, { preserveBlocks = false } = {}) {
  if (!html) return '';
  let text = String(html);
  if (preserveBlocks) {
    text = text
      .replace(/<br\s*\/?\s*>/gi, '\n')
      .replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n');
  }
  return text
    .replace(/<[^>]+>/g, '')
    .replace(ENTITY_RE, (entity, decimal, hex, name) => {
      if (name !== undefined) return NAMED_ENTITIES[name];
      const codePoint = decimal !== undefined
        ? Number(decimal)
        : Number.parseInt(hex, 16);
      // Keep the original entity text when it is not a valid character.
      return decodeEntity(codePoint) ?? entity;
    })
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
28+
29+
export const __test__ = { decodeEntity };

clis/zhihu/text.test.js

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import { describe, expect, it } from 'vitest';
2+
import { stripHtml } from './text.js';
3+
4+
describe('zhihu text helpers', () => {
5+
it('strips tags and decodes named entities in flat mode', () => {
6+
expect(stripHtml('<em>Codex</em>&nbsp;&amp;&nbsp;&lt;CLI&gt;')).toBe('Codex & <CLI>');
7+
});
8+
9+
it('decodes decimal and hexadecimal numeric entities', () => {
10+
expect(stripHtml('&#34;中文&#34; &#x26; &#39;test&#39;')).toBe('"中文" & \'test\'');
11+
});
12+
13+
it('keeps invalid numeric entities unchanged', () => {
14+
expect(stripHtml('bad &#9999999999; entity')).toBe('bad &#9999999999; entity');
15+
});
16+
17+
it('keeps list excerpts flat by default', () => {
18+
expect(stripHtml('<p>first</p><br><p>second</p>')).toBe('firstsecond');
19+
});
20+
21+
it('preserves block breaks when requested', () => {
22+
expect(stripHtml('<p>first</p><br><p>second</p>', { preserveBlocks: true })).toBe('first\n\nsecond');
23+
});
24+
});

0 commit comments

Comments
 (0)