Skip to content

Commit 0abf77c

Browse files
authored
Improve index.md quality with DOM preprocessing pipeline (#4125)
Add preprocessing transforms to generate-markdown.mjs that clean the Antora HTML before dom-to-semantic-markdown conversion:
- Strip <style>, <script>, and signup promo elements
- Convert admonition blocks to blockquote format
- Extract live demo JS code, drop scaffold noise
- Remove Antora heading anchor wrappers
- Convert card-layout tables to bulleted lists
- Fix about:blank# anchor references

Refactored to const arrow functions with a composable transform pipeline, following tinymce/tinymce-premium conventions.

Verified: byte-identical output across all 1,442 pages; zero about:blank, <style>, signup-promo, or kapa-widget leaks.
1 parent b41eb9d commit 0abf77c

1 file changed

Lines changed: 220 additions & 69 deletions

File tree

scripts/generate-markdown.mjs

Lines changed: 220 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
* Post-build script: converts every Antora HTML page into a Markdown sibling
55
* so AI agents can fetch clean, low-token content via content negotiation.
66
*
7-
* Uses dom-to-semantic-markdown (d2m) for conversion (preserves links in tables).
8-
*
97
* Usage: node scripts/generate-markdown.mjs [buildDir]
108
* Default buildDir = build/site
119
*/
@@ -16,97 +14,250 @@ import { JSDOM } from 'jsdom';
1614
import { convertHtmlToMarkdown } from 'dom-to-semantic-markdown';
1715
import { encode } from 'gpt-tokenizer';
1816

17+
// ---------------------------------------------------------------------------
18+
// Constants
19+
// ---------------------------------------------------------------------------
20+
1921
const BUILD_DIR = process.argv[2] || 'build/site';
2022

23+
// Class names checked, in this order, when resolving the kind of an admonition block.
const ADMONITION_TYPES = [ 'note', 'warning', 'tip', 'important', 'caution' ];

// Selector for the `a.anchor` link that is a direct child of an h2–h6 heading.
const HEADING_ANCHOR_SELECTOR = [ 'h2', 'h3', 'h4', 'h5', 'h6' ]
  .map((h) => `${h} > a.anchor`)
  .join(', ');
28+
29+
// ---------------------------------------------------------------------------
30+
// DOM helpers — small, pure-ish transforms operating on a single element
31+
// ---------------------------------------------------------------------------
32+
33+
/**
 * Detach every element under `root` that matches `selector`.
 *
 * @param {ParentNode} root - Node whose subtree is searched with `querySelectorAll`.
 * @param {string} selector - CSS selector (may be a comma-separated list).
 * @returns {void}
 */
const removeAll = (root, selector) => {
  for (const el of root.querySelectorAll(selector)) {
    el.remove();
  }
};
35+
36+
/**
 * Upper-case the first character of a string and leave the rest untouched.
 *
 * @param {string} s - Input text; an empty string is returned unchanged.
 * @returns {string} `s` with its first character upper-cased.
 */
const capitalize = (s) =>
  `${s.charAt(0).toUpperCase()}${s.slice(1)}`;
38+
39+
/**
 * Determine the admonition kind of an element from its class list.
 *
 * @param {Element} el - Element whose `classList` is inspected.
 * @returns {string} The first entry of `ADMONITION_TYPES` present as a class on
 *   `el`, or `'note'` when none of them is present.
 */
const resolveAdmonitionType = (el) =>
  ADMONITION_TYPES.find((type) => el.classList.contains(type)) ?? 'note';
41+
42+
/**
 * Test whether a table row looks like a "card" row.
 *
 * Only the first `td` of the row is inspected; the row qualifies when that cell
 * contains a `.lead` element or an `a.xref` link.
 *
 * @param {Element} tr - Table row to inspect.
 * @returns {boolean} `true` when the first cell exists and holds card content.
 */
const isCardLayoutRow = (tr) => {
  const firstCell = tr.querySelector('td');
  if (firstCell === null) return false;
  return firstCell.querySelector('.lead, a.xref') !== null;
};
46+
47+
/**
 * Test whether a table is used purely as a card layout.
 *
 * A table qualifies when it has no `thead` and every `tbody tr` passes
 * `isCardLayoutRow`. A table without any body rows is not treated as a card
 * layout: `Array.prototype.every` is vacuously true on an empty list, which
 * would otherwise classify empty tables as card tables.
 *
 * @param {Element} table - Table element to inspect.
 * @returns {boolean} `true` when the table is a header-less, non-empty card table.
 */
const isCardLayoutTable = (table) => {
  if (table.querySelector('thead')) return false;

  const rows = [ ...table.querySelectorAll('tbody tr') ];
  return rows.length > 0 && rows.every((tr) => isCardLayoutRow(tr));
};
50+
2151
// ---------------------------------------------------------------------------
22-
// Helpers
52+
// DOM transforms — each receives (article, document) and mutates in-place
2353
// ---------------------------------------------------------------------------
2454

25-
async function* walkHtml(dir) {
26-
for (const entry of await readdir(dir, { withFileTypes: true })) {
27-
const full = join(dir, entry.name);
28-
if (entry.isDirectory()) {
29-
yield* walkHtml(full);
30-
} else if (entry.name.endsWith('.html')) {
31-
yield full;
55+
/**
 * Remove `<style>`, `<script>` and `.signup-promo` elements from the article.
 * Mutates `article` in place.
 *
 * @param {Element} article - Article subtree to clean.
 * @returns {void}
 */
const stripNonContent = (article) => {
  removeAll(article, 'style, script, .signup-promo');
};
58+
59+
/**
 * Replace each `.admonitionblock` with a `<blockquote>` that starts with a bold
 * label ("Note: ", "Warning: ", …) followed by the admonition's content nodes.
 * Blocks without a `td.content` cell are left untouched. Mutates `article` in place.
 *
 * @param {Element} article - Article subtree to rewrite.
 * @param {Document} doc - Document used to create the replacement elements.
 * @returns {void}
 */
const rewriteAdmonitions = (article, doc) => {
  for (const adm of article.querySelectorAll('.admonitionblock')) {
    const content = adm.querySelector('td.content');
    if (!content) continue;

    const label = doc.createElement('strong');
    label.textContent = `${capitalize(resolveAdmonitionType(adm))}: `;

    const bq = doc.createElement('blockquote');
    bq.appendChild(label);

    // appendChild moves the node, so `firstChild` advances until the cell is empty.
    while (content.firstChild) {
      bq.appendChild(content.firstChild);
    }

    adm.replaceWith(bq);
  }
};
76+
77+
/**
 * Build a replacement for a live demo: a bold "Example" paragraph followed by a
 * `<pre><code class="language-js">` holding the text of the demo's JS pane.
 *
 * @param {Element} demo - Live demo element; its JS pane is located by an id
 *   containing `_pane_js_`.
 * @param {Document} doc - Document used to create the new nodes.
 * @returns {DocumentFragment|null} The replacement fragment, or `null` when the
 *   demo has no JS pane or the pane has no `<code>` element.
 */
const extractDemoCode = (demo, doc) => {
  const source = demo.querySelector('[id*="_pane_js_"]')?.querySelector('code');
  if (!source) return null;

  // Built with createElement/textContent rather than innerHTML: same DOM, no HTML parsing.
  const title = doc.createElement('strong');
  title.textContent = 'Example';
  const heading = doc.createElement('p');
  heading.appendChild(title);

  const codeEl = doc.createElement('code');
  codeEl.className = 'language-js';
  codeEl.textContent = source.textContent;
  const pre = doc.createElement('pre');
  pre.appendChild(codeEl);

  const fragment = doc.createDocumentFragment();
  fragment.appendChild(heading);
  fragment.appendChild(pre);
  return fragment;
};
96+
97+
/**
 * Replace every `.live-demo` element with its extracted JS example, or remove
 * the element entirely when no example code can be extracted.
 * Mutates `article` in place.
 *
 * @param {Element} article - Article subtree to rewrite.
 * @param {Document} doc - Document used to create the replacement nodes.
 * @returns {void}
 */
const rewriteLiveDemos = (article, doc) => {
  for (const demo of article.querySelectorAll('.live-demo')) {
    const replacement = extractDemoCode(demo, doc);
    if (replacement) {
      demo.replaceWith(replacement);
    } else {
      demo.remove();
    }
  }
};
103+
104+
/**
 * Remove the anchor links matched by `HEADING_ANCHOR_SELECTOR` from the article.
 * Mutates `article` in place.
 *
 * @param {Element} article - Article subtree to clean.
 * @returns {void}
 */
const stripHeadingAnchors = (article) => {
  removeAll(article, HEADING_ANCHOR_SELECTOR);
};
107+
108+
const buildListItem = (td, doc) => {
109+
const link = td.querySelector('.lead a');
110+
if (!link) return null;
111+
112+
const desc = td.querySelector('.lead ~ .paragraph');
113+
const li = doc.createElement('li');
114+
const strong = doc.createElement('strong');
115+
strong.appendChild(link.cloneNode(true));
116+
li.appendChild(strong);
117+
118+
if (desc) {
119+
li.appendChild(doc.createTextNode(' \u2014 ' + desc.textContent.trim()));
33120
}
34-
}
35-
36-
function extractTitle(doc) {
37-
const h1 = doc.querySelector('article.doc h1');
38-
if (h1) return h1.textContent.trim();
39-
const title = doc.querySelector('title');
40-
if (title) return title.textContent.trim().replace(/ \|.*$/, '');
41-
return 'Untitled';
42-
}
43-
44-
function buildFrontmatter(title, tokens) {
45-
return [
46-
'---',
47-
`title: "${title.replace(/\\/g, '\\\\').replace(/"/g, '\\"')}"`,
48-
`tokens: ${tokens}`,
49-
'---',
50-
'',
51-
].join('\n');
52-
}
53121

54-
/**
55-
* Convert article.doc HTML to Markdown using dom-to-semantic-markdown.
56-
*/
57-
function convertToMarkdown(articleHtml, dom) {
58-
return convertHtmlToMarkdown(articleHtml, {
59-
overrideDOMParser: new dom.window.DOMParser(),
60-
extractMainContent: false, // we already extracted article.doc
61-
enableTableColumnTracking: false,
62-
refifyUrls: false,
63-
websiteDomain: 'https://www.tiny.cloud',
122+
return li;
123+
};
124+
125+
const rewriteCardTables = (article, doc) => {
126+
article.querySelectorAll('table.tableblock').forEach((table) => {
127+
if (!isCardLayoutTable(table)) return;
128+
129+
const items = [ ...table.querySelectorAll('tbody td') ]
130+
.map((td) => buildListItem(td, doc))
131+
.filter(Boolean);
132+
133+
if (items.length === 0) return;
134+
135+
const ul = doc.createElement('ul');
136+
items.forEach((li) => ul.appendChild(li));
137+
table.replaceWith(ul);
64138
});
65-
}
139+
};
66140

67141
// ---------------------------------------------------------------------------
68-
// Main
142+
// Preprocessing pipeline
69143
// ---------------------------------------------------------------------------
70144

71-
async function main() {
72-
const manifest = {};
73-
let converted = 0;
74-
let skipped = 0;
145+
// DOM transforms applied to the cloned article, in array order.
// Each entry is called as `fn(article, document)` and mutates the article in place.
const TRANSFORMS = [
  stripNonContent,
  rewriteAdmonitions,
  rewriteLiveDemos,
  stripHeadingAnchors,
  rewriteCardTables,
];
75152

76-
console.log(`Generating markdown siblings in ${BUILD_DIR} …`);
153+
/**
 * Run every entry of `TRANSFORMS` over a deep clone of the article, leaving the
 * element passed in untouched.
 *
 * @param {Element} articleEl - Original article element (not mutated).
 * @param {Document} doc - Document handed to each transform for node creation.
 * @returns {Element} The transformed clone.
 */
const preprocess = (articleEl, doc) => {
  const article = articleEl.cloneNode(true);
  for (const transform of TRANSFORMS) {
    transform(article, doc);
  }
  return article;
};
77158

78-
for await (const htmlPath of walkHtml(BUILD_DIR)) {
79-
const html = await readFile(htmlPath, 'utf-8');
80-
const dom = new JSDOM(html);
81-
const article = dom.window.document.querySelector('article.doc');
159+
// ---------------------------------------------------------------------------
160+
// Conversion
161+
// ---------------------------------------------------------------------------
82162

83-
if (!article) {
84-
skipped++;
85-
continue;
86-
}
163+
/**
 * Build the options object passed to `convertHtmlToMarkdown` for one page.
 *
 * @param {{ window: { DOMParser: Function } }} dom - JSDOM instance of the page;
 *   a fresh `DOMParser` from its window is supplied as `overrideDOMParser`.
 * @returns {object} Conversion options.
 */
const D2M_OPTIONS = (dom) => {
  const { DOMParser } = dom.window;
  return {
    overrideDOMParser: new DOMParser(),
    extractMainContent: false, // the caller already extracted article.doc
    enableTableColumnTracking: false,
    refifyUrls: false,
    websiteDomain: 'https://www.tiny.cloud',
  };
};
170+
171+
/**
 * Rewrite `about:blank#fragment` references in the Markdown to plain
 * `#fragment` anchors.
 *
 * @param {string} md - Markdown produced by the converter.
 * @returns {string} Markdown with every `about:blank#` occurrence replaced by `#`.
 */
const fixBlankAnchors = (md) =>
  md.replaceAll('about:blank#', '#');
173+
174+
/**
 * Convert an article element to Markdown: preprocess a clone of the element,
 * convert its inner HTML, then repair `about:blank#` anchors in the result.
 *
 * @param {Element} articleEl - The page's article element (not mutated here;
 *   `preprocess` works on a clone).
 * @param {object} dom - JSDOM instance of the page; supplies the document and
 *   the parser options.
 * @returns {string} Markdown for the article.
 */
const toMarkdown = (articleEl, dom) => {
  const cleaned = preprocess(articleEl, dom.window.document);
  const markdown = convertHtmlToMarkdown(cleaned.innerHTML, D2M_OPTIONS(dom));
  return fixBlankAnchors(markdown);
};
179+
180+
// ---------------------------------------------------------------------------
181+
// Frontmatter
182+
// ---------------------------------------------------------------------------
183+
184+
/**
 * Escape a string for use inside a double-quoted YAML scalar.
 *
 * Backslashes and double quotes are escaped, and so are newlines, tabs and
 * other control characters: heading text can span several source lines, and a
 * raw line break inside the quoted title would corrupt the front matter.
 * JSON string escapes are valid YAML double-quoted escapes, so the body of the
 * JSON encoding is reused with its surrounding quotes stripped.
 *
 * @param {string} s - Raw text.
 * @returns {string} Escaped text, without surrounding quotes.
 */
const escapeYaml = (s) =>
  JSON.stringify(String(s)).slice(1, -1);
186+
187+
/**
 * Build the YAML front matter block placed at the top of each Markdown file.
 *
 * @param {string} title - Page title; escaped with `escapeYaml` and double-quoted.
 * @param {number} tokens - Token count of the Markdown body.
 * @returns {string} Front matter ending with the closing `---` and a newline.
 */
const buildFrontmatter = (title, tokens) =>
  [
    '---',
    `title: "${escapeYaml(title)}"`,
    `tokens: ${tokens}`,
    '---',
    '',
  ].join('\n');
87189

88-
const title = extractTitle(dom.window.document);
89-
const markdown = convertToMarkdown(article.innerHTML, dom);
90-
const tokens = encode(markdown).length;
91-
const frontmatter = buildFrontmatter(title, tokens);
92-
const fullMd = frontmatter + markdown + '\n';
190+
// ---------------------------------------------------------------------------
191+
// Title extraction
192+
// ---------------------------------------------------------------------------
93193

94-
const mdPath = htmlPath.replace(/\.html$/, '.md');
95-
await mkdir(dirname(mdPath), { recursive: true });
96-
await writeFile(mdPath, fullMd, 'utf-8');
194+
/**
 * Pick a title for the page.
 *
 * Preference order: the trimmed text of `article.doc h1`; then the trimmed
 * `<title>` text with everything from the first " |" onwards removed; then
 * `'Untitled'`. An empty or whitespace-only candidate is skipped, so a blank
 * heading does not produce a blank title.
 *
 * @param {Document} doc - Page document.
 * @returns {string} A non-empty title.
 */
const extractTitle = (doc) => {
  const heading = doc.querySelector('article.doc h1')?.textContent?.trim();
  if (heading) return heading;

  const pageTitle = doc.querySelector('title')?.textContent?.trim()?.replace(/ \|.*$/, '');
  // `||` is deliberate: an empty string must fall through to the default.
  return pageTitle || 'Untitled';
};
97198

98-
const urlPath = '/' + relative(BUILD_DIR, dirname(htmlPath)) + '/';
99-
manifest[urlPath] = tokens;
100-
converted++;
199+
// ---------------------------------------------------------------------------
200+
// File walking
201+
// ---------------------------------------------------------------------------
202+
203+
/**
 * Recursively collect the paths of all `.html` files under a directory.
 * Sub-directories are read concurrently.
 *
 * @param {string} dir - Directory to scan.
 * @returns {Promise<string[]>} Paths of the `.html` files found, each built by
 *   joining `dir` with the entry names below it.
 */
const collectHtmlFiles = async (dir) => {
  const entries = await readdir(dir, { withFileTypes: true });

  const nested = await Promise.all(
    entries.map((entry) => {
      const full = join(dir, entry.name);
      if (entry.isDirectory()) return collectHtmlFiles(full);
      return entry.name.endsWith('.html') ? [ full ] : [];
    })
  );

  return nested.flat();
};
215+
216+
// ---------------------------------------------------------------------------
217+
// Single-page conversion
218+
// ---------------------------------------------------------------------------
219+
220+
/**
 * Convert one HTML page to a Markdown sibling file.
 *
 * Reads the page, converts its `article.doc` element to Markdown, prepends the
 * front matter and writes the result next to the source with a `.md` extension.
 *
 * @param {string} htmlPath - Path of the `.html` file to convert.
 * @returns {Promise<{ path: string, tokens: number } | null>} The manifest entry
 *   for the page (`path` is the page's directory relative to `BUILD_DIR`, with
 *   leading and trailing slashes), or `null` when the page has no `article.doc`.
 */
const convertPage = async (htmlPath) => {
  const html = await readFile(htmlPath, 'utf-8');
  const dom = new JSDOM(html);
  const { document } = dom.window;
  const articleEl = document.querySelector('article.doc');

  if (!articleEl) return null;

  const title = extractTitle(document);
  const markdown = toMarkdown(articleEl, dom);
  const tokens = encode(markdown).length;
  const content = `${buildFrontmatter(title, tokens)}${markdown}\n`;
  const mdPath = htmlPath.replace(/\.html$/, '.md');

  await mkdir(dirname(mdPath), { recursive: true });
  await writeFile(mdPath, content, 'utf-8');

  // A page directly inside BUILD_DIR has an empty relative directory; map it to
  // "/" rather than "//".
  const relDir = relative(BUILD_DIR, dirname(htmlPath));
  return { path: relDir === '' ? '/' : `/${relDir}/`, tokens };
};
238+
239+
// ---------------------------------------------------------------------------
240+
// Main
241+
// ---------------------------------------------------------------------------
242+
243+
const main = async () => {
244+
console.log(`Generating markdown siblings in ${BUILD_DIR} …`);
245+
246+
const htmlFiles = await collectHtmlFiles(BUILD_DIR);
247+
const pages = [];
248+
249+
for (const htmlPath of htmlFiles) {
250+
const result = await convertPage(htmlPath);
251+
if (result) pages.push(result);
101252
}
102253

254+
const manifest = Object.fromEntries(pages.map(({ path, tokens }) => [ path, tokens ]));
103255
const manifestPath = join(BUILD_DIR, '_markdown-manifest.json');
104256
await writeFile(manifestPath, JSON.stringify(manifest, null, 2) + '\n', 'utf-8');
105257

106-
console.log(
107-
`Done. ${converted} pages converted, ${skipped} skipped (no article.doc). Manifest → ${manifestPath}`
108-
);
109-
}
258+
const skipped = htmlFiles.length - pages.length;
259+
console.log(`Done. ${pages.length} pages converted, ${skipped} skipped (no article.doc). Manifest → ${manifestPath}`);
260+
};
110261

111262
main().catch((err) => {
112263
console.error(err);

0 commit comments

Comments
 (0)