Skip to content

Commit 418a102

Browse files
authored
Improve markdown conversion quality (#1229)
Refactor HTML-to-Markdown converter to eliminate navigation chrome and noise artifacts, improving content quality for AI consumption.
1 parent c87015d commit 418a102

1 file changed

Lines changed: 135 additions & 23 deletions

File tree

netlify/edge-functions/markdown-negotiation.ts

Lines changed: 135 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,34 @@
11
// MIME type string for Markdown content, declared as UTF-8.
const MARKDOWN_CONTENT_TYPE = "text/markdown; charset=utf-8";
22
// These two values match the DOM `Node.nodeType` constants TEXT_NODE (3) and
// ELEMENT_NODE (1). NOTE(review): presumably compared against `node.nodeType`
// in the renderers; that usage is not visible in this excerpt.
const TEXT_NODE = 3;
33
const ELEMENT_NODE = 1;
4+
// CSS selectors for elements treated as noise. `pruneDocument` runs
// `querySelectorAll` for each entry and removes every matching node.
// The list mixes element names, class selectors, id selectors and
// aria-label attribute selectors.
// NOTE(review): the bare "header" / "footer" entries match those elements
// anywhere in the document, including inside an <article> — confirm that
// article-level headers never carry content that should be kept.
const NOISE_SELECTORS = [
5+
"script",
6+
"style",
7+
"noscript",
8+
"template",
9+
"svg",
10+
"header",
11+
"footer",
12+
"nav",
13+
"aside",
14+
".site-header",
15+
".site-footer",
16+
".side-bar",
17+
".secondary-nav",
18+
".search-input-wrap",
19+
".social-media-list",
20+
".footer-nav",
21+
".skip-to-content-link",
22+
".header-anchor",
23+
".next-previous",
24+
".next-previous--feedback",
25+
".home-banner-info",
26+
"#secondary-nav",
27+
"#site-nav",
28+
"#toggled-search",
29+
"[aria-label='Main navigation']",
30+
"[aria-label='Footer navigation']",
31+
];
432

533
export default async (request: Request, context: { next: () => Promise<Response> }) => {
634
const response = await context.next();
@@ -76,18 +104,15 @@ function htmlToMarkdown(html: string, baseUrl: URL): string {
76104
return htmlToMarkdownFallback(html, baseUrl);
77105
}
78106

79-
for (const selector of ["script", "style", "noscript"]) {
80-
doc.querySelectorAll(selector).forEach((node: any) => node.remove());
81-
}
107+
pruneDocument(doc);
82108

83109
const title = normalizeWhitespace(doc.querySelector("title")?.textContent || "");
84-
const body = doc.body;
85-
const bodyMarkdown = body ? renderChildren(body, baseUrl) : "";
110+
const contentRoot = selectContentRoot(doc);
111+
const bodyMarkdown = contentRoot ? renderChildren(contentRoot, baseUrl) : "";
86112

87113
const parts: string[] = [];
88114
if (title) {
89115
parts.push(`---\ntitle: ${title}\n---`);
90-
parts.push(`# ${title}`);
91116
}
92117

93118
if (bodyMarkdown) {
@@ -102,19 +127,20 @@ function htmlToMarkdownFallback(html: string, baseUrl: URL): string {
102127
.replace(/<!--[\s\S]*?-->/g, "")
103128
.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "")
104129
.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, "")
105-
.replace(/<noscript\b[^>]*>[\s\S]*?<\/noscript>/gi, "");
130+
.replace(/<noscript\b[^>]*>[\s\S]*?<\/noscript>/gi, "")
131+
.replace(/<svg\b[^>]*>[\s\S]*?<\/svg>/gi, "");
106132

107133
const titleMatch = sanitized.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i);
108134
const title = titleMatch ? normalizeWhitespace(decodeHtmlEntities(stripTags(titleMatch[1]))) : "";
109135

110136
const bodyMatch = sanitized.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
111-
const bodyHtml = bodyMatch ? bodyMatch[1] : sanitized;
137+
const rawBodyHtml = bodyMatch ? bodyMatch[1] : sanitized;
138+
const bodyHtml = extractPrimaryHtmlFragment(rawBodyHtml);
112139
const bodyMarkdown = renderHtmlFragmentToMarkdown(bodyHtml, baseUrl);
113140

114141
const parts: string[] = [];
115142
if (title) {
116143
parts.push(`---\ntitle: ${title}\n---`);
117-
parts.push(`# ${title}`);
118144
}
119145

120146
if (bodyMarkdown) {
@@ -126,7 +152,7 @@ function htmlToMarkdownFallback(html: string, baseUrl: URL): string {
126152

127153
function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string {
128154
const protectedBlocks: string[] = [];
129-
let content = html;
155+
let content = stripFallbackNoise(html);
130156

131157
// Protect code blocks so later tag cleanup does not alter their content.
132158
content = content.replace(/<pre\b[^>]*>([\s\S]*?)<\/pre>/gi, (_match, preContent: string) => {
@@ -137,18 +163,10 @@ function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string {
137163
});
138164

139165
content = content.replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, (_match, level: string, text: string) => {
140-
const clean = decodeHtmlEntities(stripTags(text)).trim();
166+
const clean = normalizeWhitespace(decodeHtmlEntities(stripTags(text)));
141167
return clean ? `\n\n${"#".repeat(Number(level))} ${clean}\n\n` : "";
142168
});
143169

144-
content = content.replace(/<a\b([^>]*)>([\s\S]*?)<\/a>/gi, (_match, attrs: string, text: string) => {
145-
const hrefMatch = attrs.match(/href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i);
146-
const rawHref = hrefMatch ? hrefMatch[1] || hrefMatch[2] || hrefMatch[3] || "" : "";
147-
const href = toAbsoluteUrl(rawHref, baseUrl);
148-
const label = normalizeWhitespace(decodeHtmlEntities(stripTags(text))) || href;
149-
return href ? `[${label}](${href})` : label;
150-
});
151-
152170
content = content.replace(/<img\b([^>]*)>/gi, (_match, attrs: string) => {
153171
const altMatch = attrs.match(/alt\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i);
154172
const srcMatch = attrs.match(/src\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i);
@@ -159,6 +177,18 @@ function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string {
159177
return src ? `![${alt}](${src})` : "";
160178
});
161179

180+
content = content.replace(/<a\b([^>]*)>([\s\S]*?)<\/a>/gi, (_match, attrs: string, text: string) => {
181+
const hrefMatch = attrs.match(/href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i);
182+
const rawHref = hrefMatch ? hrefMatch[1] || hrefMatch[2] || hrefMatch[3] || "" : "";
183+
const href = toAbsoluteUrl(rawHref, baseUrl);
184+
const label = normalizeWhitespace(decodeHtmlEntities(stripTags(text)));
185+
if (!href || !label) {
186+
return label;
187+
}
188+
189+
return `[${label}](${href})`;
190+
});
191+
162192
content = content
163193
.replace(/<(strong|b)\b[^>]*>([\s\S]*?)<\/\1>/gi, (_m, _tag: string, text: string) => {
164194
const clean = normalizeWhitespace(decodeHtmlEntities(stripTags(text)));
@@ -185,7 +215,7 @@ function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string {
185215
content = decodeHtmlEntities(stripTags(content))
186216
.replace(/\n{3,}/g, "\n\n")
187217
.split("\n")
188-
.map((line) => line.trimEnd())
218+
.map((line) => line.trim())
189219
.join("\n")
190220
.trim();
191221

@@ -244,8 +274,12 @@ function renderNode(node: any, baseUrl: URL, listDepth: number): string {
244274
}
245275

246276
if (tag === "a") {
247-
const text = renderInlineChildren(el, baseUrl) || normalizeWhitespace(el.getAttribute("href") || "");
277+
const text = renderInlineChildren(el, baseUrl);
248278
const href = toAbsoluteUrl(el.getAttribute("href"), baseUrl);
279+
if (!text) {
280+
return "";
281+
}
282+
249283
return href ? `[${text}](${href})` : text;
250284
}
251285

@@ -349,9 +383,20 @@ function renderInlineChildren(parent: any, baseUrl: URL): string {
349383
const tag = el.tagName.toLowerCase();
350384

351385
if (tag === "a") {
352-
const text = renderInlineChildren(el, baseUrl) || normalizeWhitespace(el.getAttribute("href") || "");
386+
const text = renderInlineChildren(el, baseUrl);
353387
const href = toAbsoluteUrl(el.getAttribute("href"), baseUrl);
354-
out.push(href ? `[${text}](${href})` : text);
388+
if (text) {
389+
out.push(href ? `[${text}](${href})` : text);
390+
}
391+
continue;
392+
}
393+
394+
if (tag === "img") {
395+
const alt = normalizeWhitespace(el.getAttribute("alt") || "image");
396+
const src = toAbsoluteUrl(el.getAttribute("src"), baseUrl);
397+
if (src) {
398+
out.push(`![${alt}](${src})`);
399+
}
355400
continue;
356401
}
357402

@@ -408,3 +453,70 @@ function toAbsoluteUrl(href: string | null, baseUrl: URL): string {
408453
/**
 * Collapses every run of whitespace (spaces, tabs, newlines) into a single
 * space and strips leading and trailing whitespace.
 *
 * @param value - Text to normalize.
 * @returns The single-line, trimmed text.
 */
function normalizeWhitespace(value: string): string {
  const collapsed = value.replace(/\s+/g, " ");
  return collapsed.trim();
}
456+
457+
/**
 * Removes every node that matches one of the `NOISE_SELECTORS` entries from
 * the given document, mutating it in place.
 *
 * @param doc - Any object exposing `querySelectorAll`; each result must
 *   support `forEach` and yield nodes that have a `remove()` method.
 */
function pruneDocument(doc: { querySelectorAll: (selector: string) => any }): void {
  const detach = (node: any): void => {
    node.remove();
  };

  NOISE_SELECTORS.forEach((selector: string) => {
    doc.querySelectorAll(selector).forEach(detach);
  });
}
462+
463+
/**
 * Picks the DOM node whose children are rendered as the Markdown body.
 *
 * Resolution order:
 * 1. A `<main>` element carrying the `home-page` class is returned as-is.
 * 2. Otherwise the first match among the preferred selectors, from the most
 *    specific ("#main article") down to "body".
 * 3. `doc.body`, or `null` when nothing matched at all.
 *
 * @param doc - Document-like object exposing `querySelector` and optionally `body`.
 * @returns The selected node, or `null` if the document has no usable root.
 */
function selectContentRoot(doc: { querySelector: (selector: string) => any; body?: any }): any {
  const main = doc.querySelector("main");

  // Compare whole class tokens. A `\bhome-page\b` regex treats "-" as a word
  // boundary and would also accept e.g. "home-page-hero" or "not-home-page".
  const mainClasses: string[] = String(main?.getAttribute?.("class") || "").split(/\s+/);
  if (main && mainClasses.includes("home-page")) {
    return main;
  }

  const preferredSelectors = [
    "#main article",
    "main article",
    "article.guide",
    "article",
    "#main",
    "[role='main']",
    "main",
    "body",
  ];

  for (const selector of preferredSelectors) {
    const found = doc.querySelector(selector);
    if (found) {
      return found;
    }
  }

  return doc.body || null;
}
489+
490+
/**
 * Regex-based counterpart of `selectContentRoot` for the fallback converter:
 * narrows a body HTML string down to its primary content fragment.
 *
 * Resolution order:
 * 1. The inner HTML of the first `<main>` when that tag carries the
 *    `home-page` class and is non-empty.
 * 2. The inner HTML of the first `<article>` inside `<main>`, else the first
 *    `<article>` anywhere in the input.
 * 3. The inner HTML of `<main>`.
 * 4. The input unchanged.
 *
 * Matching is non-greedy, so an element that nests another element of the
 * same name is cut at the first closing tag.
 *
 * @param html - Body HTML to search.
 * @returns The inner HTML of the chosen element, or `html` itself.
 */
function extractPrimaryHtmlFragment(html: string): string {
  const mainMatch = html.match(/<main\b([^>]*)>([\s\S]*?)<\/main>/i);
  const mainAttrs = mainMatch?.[1] ?? "";
  const mainHtml = mainMatch?.[2] ?? "";

  // Read the class attribute of the <main> tag that was actually captured and
  // compare whole tokens; "\bhome-page\b" would also accept "home-page-hero".
  const classMatch = mainAttrs.match(/(?:^|\s)class\s*=\s*(?:"([^"]*)"|'([^']*)')/i);
  const classTokens = (classMatch?.[1] ?? classMatch?.[2] ?? "").split(/\s+/);
  const isHome = classTokens.includes("home-page");

  if (isHome && mainHtml) {
    return mainHtml;
  }

  // Prefer an article inside <main>, mirroring the "main article" preference
  // of the DOM path, so an <article> in a sidebar that precedes <main> is not
  // mistaken for the primary content.
  const articlePattern = /<article\b[^>]*>([\s\S]*?)<\/article>/i;
  const articleMatch = mainHtml.match(articlePattern) ?? html.match(articlePattern);
  if (articleMatch) {
    return articleMatch[1] ?? "";
  }

  if (mainHtml) {
    return mainHtml;
  }

  return html;
}
510+
511+
/**
 * Removes every `<tag>…</tag>` element from `input`, optionally only those
 * whose opening tag has a class attribute containing `className`.
 *
 * Nested elements of the same tag name are depth-counted, so the element is
 * removed up to its own closing tag rather than the first closing tag found.
 * An element that is never closed is left untouched.
 */
function removeBalancedElements(input: string, tag: string, className?: string): string {
  const tagPattern = new RegExp(`<(/?)${tag}\\b[^>]*>`, "gi");
  // "\b" treats "-" as a boundary, so "next-previous" also matches
  // "next-previous--feedback".
  const classPattern = className
    ? new RegExp(`class=("|')[^"']*\\b${className}\\b[^"']*\\1`, "i")
    : null;

  let output = "";
  let keptUpTo = 0; // end of the last removed element (start of pending text)
  let removalStart = -1; // index of the opening tag being removed, -1 if none
  let depth = 0;

  for (let match = tagPattern.exec(input); match !== null; match = tagPattern.exec(input)) {
    const isClosing = match[1] === "/";

    if (removalStart === -1) {
      // Outside a removal: only a qualifying opening tag starts one.
      if (isClosing || (classPattern !== null && !classPattern.test(match[0]))) {
        continue;
      }
      removalStart = match.index;
      depth = 1;
      continue;
    }

    depth += isClosing ? -1 : 1;
    if (depth === 0) {
      output += input.slice(keptUpTo, removalStart);
      keptUpTo = match.index + match[0].length;
      removalStart = -1;
    }
  }

  return output + input.slice(keptUpTo);
}

/**
 * Strips navigation chrome from an HTML fragment before the regex-based
 * fallback conversion: `<header>`, `<footer>`, `<nav>`, `<aside>`, plus the
 * `next-previous` / `side-bar` divs, the `home-banner-info` section and the
 * `skip-to-content-link` / `header-anchor` links.
 *
 * Removal is depth-aware: a container holding nested elements of the same
 * tag (e.g. a sidebar `<div>` with inner `<div>`s) is removed completely
 * instead of being cut at the first closing tag.
 *
 * @param html - HTML fragment to clean.
 * @returns The fragment with the noise elements removed.
 */
function stripFallbackNoise(html: string): string {
  const rules: ReadonlyArray<{ tag: string; className?: string }> = [
    { tag: "header" },
    { tag: "footer" },
    { tag: "nav" },
    { tag: "aside" },
    { tag: "div", className: "next-previous" },
    { tag: "section", className: "home-banner-info" },
    { tag: "div", className: "side-bar" },
    { tag: "a", className: "skip-to-content-link" },
    { tag: "a", className: "header-anchor" },
  ];

  return rules.reduce(
    (current, rule) => removeBalancedElements(current, rule.tag, rule.className),
    html,
  );
}

0 commit comments

Comments
 (0)