Skip to content

Commit d170b55

Browse files
committed
Fix HTML sanitization in SEO plugin with proper parser
Replace regex-based HTML tag stripping with htmlparser2 for extracting plain-text descriptions from rendered pages.
1 parent: e2077ce · commit: d170b55

1 file changed

Lines changed: 7 additions & 5 deletions

File tree

scripts/typedoc-plugin-seo.mjs

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ import { Renderer } from "typedoc";
1010
import fs from "node:fs";
1111
import path from "node:path";
1212
import zlib from "node:zlib";
13+
import * as htmlparser2 from "htmlparser2";
1314

1415
const SITE_NAME = "MCP Apps";
1516

@@ -30,7 +31,7 @@ function toSlug(filename) {
3031

3132
/**
3233
* Extract a plain-text description from the rendered HTML body.
33-
* Takes the first meaningful paragraph text, strips tags, and truncates.
34+
* Parses the first content paragraph using htmlparser2 and truncates.
3435
* @param {string} html
3536
* @returns {string}
3637
*/
@@ -48,10 +49,11 @@ function extractDescription(html) {
4849
if (!paragraphs) return "";
4950

5051
for (const p of paragraphs) {
51-
const text = p
52-
.replace(/<[^>]+>/g, "") // strip HTML tags
53-
.replace(/&[a-z]+;/g, " ") // strip HTML entities
54-
.replace(/\s+/g, " ") // collapse whitespace
52+
const text = htmlparser2
53+
.parseDocument(p)
54+
.children.map((node) => htmlparser2.DomUtils.textContent(node))
55+
.join("")
56+
.replace(/\s+/g, " ")
5557
.trim();
5658

5759
// Skip very short or code-heavy paragraphs

0 commit comments

Comments
 (0)