44 * Post-build script: converts every Antora HTML page into a Markdown sibling
55 * so AI agents can fetch clean, low-token content via content negotiation.
66 *
7- * Uses dom-to-semantic-markdown (d2m) for conversion (preserves links in tables).
8- *
97 * Usage: node scripts/generate-markdown.mjs [buildDir]
108 * Default buildDir = build/site
119 */
@@ -16,97 +14,250 @@ import { JSDOM } from 'jsdom';
1614import { convertHtmlToMarkdown } from 'dom-to-semantic-markdown' ;
1715import { encode } from 'gpt-tokenizer' ;
1816
17+ // ---------------------------------------------------------------------------
18+ // Constants
19+ // ---------------------------------------------------------------------------
20+
/** Root directory of the generated site; the first CLI argument overrides the default. */
const BUILD_DIR = process.argv[2] || 'build/site';

/** Admonition kinds detected via CSS class; frozen because the list is shared, read-only data. */
const ADMONITION_TYPES = Object.freeze(['note', 'warning', 'tip', 'important', 'caution']);

/** Selector matching an `a.anchor` that is a direct child of any h2–h6 heading. */
const HEADING_ANCHOR_SELECTOR = ['h2', 'h3', 'h4', 'h5', 'h6']
  .map((tag) => `${tag} > a.anchor`)
  .join(', ');
28+
29+ // ---------------------------------------------------------------------------
30+ // DOM helpers — small, pure-ish transforms operating on a single element
31+ // ---------------------------------------------------------------------------
32+
/**
 * Detach every descendant of `root` that matches `selector`.
 *
 * @param {{ querySelectorAll(selector: string): Iterable<{ remove(): void }> }} root
 * @param {string} selector - CSS selector list.
 * @returns {void}
 */
const removeAll = (root, selector) => {
  for (const el of root.querySelectorAll(selector)) {
    el.remove();
  }
};
35+
/**
 * Upper-case the first character of a string, leaving the rest untouched.
 *
 * @param {string} s
 * @returns {string} `s` with its first character upper-cased ('' stays '').
 */
const capitalize = (s) => `${s.charAt(0).toUpperCase()}${s.slice(1)}`;
38+
/**
 * Pick the admonition kind from an element's class list.
 *
 * @param {Element} el - The admonition block element.
 * @returns {string} The first entry of ADMONITION_TYPES present as a class on
 *   `el`, or 'note' when none of them is present.
 */
const resolveAdmonitionType = (el) => {
  const match = ADMONITION_TYPES.find((type) => el.classList.contains(type));
  return match ?? 'note';
};
41+
/**
 * Decide whether a table row looks like a "card" row: its first cell holds a
 * `.lead` element or an `a.xref` link.
 *
 * @param {Element} tr - Table row to inspect.
 * @returns {boolean} false when the row has no `td` or the first `td` has no match.
 */
const isCardLayoutRow = (tr) => {
  const firstCell = tr.querySelector('td');
  if (firstCell === null) return false;
  return firstCell.querySelector('.lead, a.xref') !== null;
};
46+
/**
 * Decide whether a table is used purely for card layout: it has no `thead`
 * and every body row passes `isCardLayoutRow`.
 *
 * @param {Element} table
 * @returns {boolean}
 */
const isCardLayoutTable = (table) => {
  if (table.querySelector('thead')) return false;

  const rows = [...table.querySelectorAll('tbody tr')];
  // `every` is vacuously true for an empty list; a table without body rows is
  // not a card layout.
  return rows.length > 0 && rows.every(isCardLayoutRow);
};
50+
2151// ---------------------------------------------------------------------------
22- // Helpers
52+ // DOM transforms — each receives (article, document) and mutates in-place
2353// ---------------------------------------------------------------------------
2454
25- async function * walkHtml ( dir ) {
26- for ( const entry of await readdir ( dir , { withFileTypes : true } ) ) {
27- const full = join ( dir , entry . name ) ;
28- if ( entry . isDirectory ( ) ) {
29- yield * walkHtml ( full ) ;
30- } else if ( entry . name . endsWith ( '.html' ) ) {
31- yield full ;
/**
 * Remove elements that carry no article content: `<style>`, `<script>` and
 * `.signup-promo` blocks. Mutates `article` in place.
 *
 * @param {Element} article
 * @returns {void}
 */
const stripNonContent = (article) => {
  removeAll(article, 'style, script, .signup-promo');
};
58+
/**
 * Replace each `.admonitionblock` with a `<blockquote>` that starts with a
 * bold "<Type>: " label followed by the admonition's content nodes.
 * Blocks without a `td.content` cell are left untouched. Mutates `article`.
 *
 * @param {Element} article
 * @param {Document} doc - Document used to create the replacement elements.
 * @returns {void}
 */
const rewriteAdmonitions = (article, doc) => {
  for (const adm of article.querySelectorAll('.admonitionblock')) {
    const content = adm.querySelector('td.content');
    if (!content) continue;

    const label = doc.createElement('strong');
    label.textContent = `${capitalize(resolveAdmonitionType(adm))}: `;

    const bq = doc.createElement('blockquote');
    bq.appendChild(label);
    // Appending a node moves it, so this drains `content` child by child.
    while (content.firstChild) {
      bq.appendChild(content.firstChild);
    }

    adm.replaceWith(bq);
  }
};
76+
/**
 * Build a static replacement for a live demo: a bold "Example" paragraph
 * followed by a `<pre><code class="language-js">` holding the text of the
 * demo's JS pane.
 *
 * @param {Element} demo - The live-demo container.
 * @param {Document} doc - Document used to create the new nodes.
 * @returns {DocumentFragment | null} null when the demo has no JS pane
 *   (an element whose id contains "_pane_js_") or that pane has no `<code>`.
 */
const extractDemoCode = (demo, doc) => {
  const source = demo.querySelector('[id*="_pane_js_"]')?.querySelector('code');
  if (!source) return null;

  // Built with createElement/textContent rather than innerHTML so no markup
  // string is ever parsed.
  const label = doc.createElement('strong');
  label.textContent = 'Example';
  const heading = doc.createElement('p');
  heading.appendChild(label);

  const codeEl = doc.createElement('code');
  codeEl.className = 'language-js';
  codeEl.textContent = source.textContent;
  const pre = doc.createElement('pre');
  pre.appendChild(codeEl);

  const fragment = doc.createDocumentFragment();
  fragment.appendChild(heading);
  fragment.appendChild(pre);
  return fragment;
};
96+
/**
 * Replace every `.live-demo` with the static snippet produced by
 * `extractDemoCode`; demos that yield no snippet are removed.
 * Mutates `article` in place.
 *
 * @param {Element} article
 * @param {Document} doc - Document used to create replacement nodes.
 * @returns {void}
 */
const rewriteLiveDemos = (article, doc) => {
  for (const demo of article.querySelectorAll('.live-demo')) {
    const replacement = extractDemoCode(demo, doc);
    if (replacement) {
      demo.replaceWith(replacement);
    } else {
      demo.remove();
    }
  }
};
103+
/**
 * Remove the anchor links matched by HEADING_ANCHOR_SELECTOR from `article`.
 * Mutates `article` in place.
 *
 * @param {Element} article
 * @returns {void}
 */
const stripHeadingAnchors = (article) => {
  removeAll(article, HEADING_ANCHOR_SELECTOR);
};
107+
/**
 * Turn one card-table cell into a list item: a bold copy of the cell's
 * `.lead a` link, followed by " — <description>" when a `.paragraph` sibling
 * after `.lead` has text.
 *
 * @param {Element} td - Table cell to convert.
 * @param {Document} doc - Document used to create the new nodes.
 * @returns {HTMLLIElement | null} null when the cell has no `.lead a` link.
 */
const buildListItem = (td, doc) => {
  const link = td.querySelector('.lead a');
  if (!link) return null;

  const strong = doc.createElement('strong');
  // Clone so the link inside the source table is left in place.
  strong.appendChild(link.cloneNode(true));

  const li = doc.createElement('li');
  li.appendChild(strong);

  // A whitespace-only description is skipped so the item never ends in a
  // dangling dash.
  const description = td.querySelector('.lead ~ .paragraph')?.textContent.trim();
  if (description) {
    li.appendChild(doc.createTextNode(` \u2014 ${description}`));
  }

  return li;
};
124+
/**
 * Replace each card-layout `table.tableblock` with a `<ul>` containing one
 * item per body cell that `buildListItem` can convert. Tables that are not
 * card layouts, or that produce no items, are left untouched.
 * Mutates `article` in place.
 *
 * @param {Element} article
 * @param {Document} doc - Document used to create the list elements.
 * @returns {void}
 */
const rewriteCardTables = (article, doc) => {
  for (const table of article.querySelectorAll('table.tableblock')) {
    if (!isCardLayoutTable(table)) continue;

    const items = [];
    for (const td of table.querySelectorAll('tbody td')) {
      const li = buildListItem(td, doc);
      if (li) items.push(li);
    }
    if (items.length === 0) continue;

    const ul = doc.createElement('ul');
    for (const li of items) {
      ul.appendChild(li);
    }
    table.replaceWith(ul);
  }
};
66140
67141// ---------------------------------------------------------------------------
68- // Main
142+ // Preprocessing pipeline
69143// ---------------------------------------------------------------------------
70144
71- async function main ( ) {
72- const manifest = { } ;
73- let converted = 0 ;
74- let skipped = 0 ;
/**
 * DOM transforms in the order they are applied. Each entry is called as
 * `fn(article, document)` and mutates the article in place.
 */
const TRANSFORMS = [
  stripNonContent,
  rewriteAdmonitions,
  rewriteLiveDemos,
  stripHeadingAnchors,
  rewriteCardTables,
];
75152
76- console . log ( `Generating markdown siblings in ${ BUILD_DIR } …` ) ;
/**
 * Run every entry of TRANSFORMS over a deep clone of the article, so the
 * element passed in is never modified.
 *
 * @param {Element} articleEl - Source article element.
 * @param {Document} doc - Document handed to each transform.
 * @returns {Element} The transformed clone.
 */
const preprocess = (articleEl, doc) => {
  const article = articleEl.cloneNode(true);
  for (const transform of TRANSFORMS) {
    transform(article, doc);
  }
  return article;
};
77158
78- for await ( const htmlPath of walkHtml ( BUILD_DIR ) ) {
79- const html = await readFile ( htmlPath , 'utf-8' ) ;
80- const dom = new JSDOM ( html ) ;
81- const article = dom . window . document . querySelector ( 'article.doc' ) ;
159+ // ---------------------------------------------------------------------------
160+ // Conversion
161+ // ---------------------------------------------------------------------------
82162
83- if ( ! article ) {
84- skipped ++ ;
85- continue ;
86- }
/**
 * Build the options object passed to `convertHtmlToMarkdown`.
 *
 * @param {{ window: { DOMParser: new () => object } }} dom - JSDOM instance
 *   whose `DOMParser` the converter should use.
 * @returns {object} Options with a fresh parser instance on every call.
 */
const D2M_OPTIONS = (dom) => {
  const { DOMParser } = dom.window;
  return {
    overrideDOMParser: new DOMParser(),
    extractMainContent: false,
    enableTableColumnTracking: false,
    refifyUrls: false,
    websiteDomain: 'https://www.tiny.cloud',
  };
};
170+
/**
 * Rewrite every "about:blank#" prefix in the Markdown to a bare "#".
 * NOTE(review): presumably the converter resolves in-page fragment links
 * against an `about:blank` base URL — confirm against its output.
 *
 * @param {string} md - Markdown text.
 * @returns {string} Markdown with all occurrences replaced.
 */
const fixBlankAnchors = (md) => md.replace(/about:blank#/g, '#');
173+
/**
 * Convert an article element to Markdown: preprocess a clone of it, convert
 * the clone's inner HTML, then clean up "about:blank#" link prefixes.
 *
 * @param {Element} articleEl - The article element to convert.
 * @param {object} dom - JSDOM instance that owns `articleEl`.
 * @returns {string} The generated Markdown.
 */
const toMarkdown = (articleEl, dom) => {
  const { innerHTML } = preprocess(articleEl, dom.window.document);
  const rawMarkdown = convertHtmlToMarkdown(innerHTML, D2M_OPTIONS(dom));
  return fixBlankAnchors(rawMarkdown);
};
179+
180+ // ---------------------------------------------------------------------------
181+ // Frontmatter
182+ // ---------------------------------------------------------------------------
183+
/**
 * Escape a string for use inside a YAML double-quoted scalar.
 *
 * Backslashes and double quotes are escaped, and so are newline, carriage
 * return and tab: a raw line break inside a double-quoted scalar is folded
 * by YAML parsers, which would silently alter the value.
 *
 * @param {string} s
 * @returns {string}
 */
const escapeYaml = (s) =>
  s
    // Backslash first, so the escapes added below are not escaped again.
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    .replace(/\n/g, '\\n')
    .replace(/\r/g, '\\r')
    .replace(/\t/g, '\\t');
186+
/**
 * Build the YAML frontmatter block that precedes the Markdown body.
 *
 * @param {string} title - Page title; escaped for a double-quoted scalar.
 * @param {number} tokens - Token count of the Markdown body.
 * @returns {string} Frontmatter text ending in a newline after the closing `---`.
 */
const buildFrontmatter = (title, tokens) =>
  [
    '---',
    `title: "${escapeYaml(title)}"`,
    `tokens: ${tokens}`,
    '---',
    '',
  ].join('\n');
87189
88- const title = extractTitle ( dom . window . document ) ;
89- const markdown = convertToMarkdown ( article . innerHTML , dom ) ;
90- const tokens = encode ( markdown ) . length ;
91- const frontmatter = buildFrontmatter ( title , tokens ) ;
92- const fullMd = frontmatter + markdown + '\n' ;
190+ // ---------------------------------------------------------------------------
191+ // Title extraction
192+ // ---------------------------------------------------------------------------
93193
94- const mdPath = htmlPath . replace ( / \. h t m l $ / , '.md' ) ;
95- await mkdir ( dirname ( mdPath ) , { recursive : true } ) ;
96- await writeFile ( mdPath , fullMd , 'utf-8' ) ;
/**
 * Derive the page title.
 *
 * Order of preference: the text of `article.doc h1`; the `<title>` text up to
 * the first "|" (text from the first "|" to the end is dropped); otherwise
 * 'Untitled'. Empty or whitespace-only candidates are skipped, and the
 * `<title>` candidate is trimmed after the suffix is removed so "Page | Site"
 * yields "Page" without a trailing space.
 *
 * @param {Document} doc
 * @returns {string}
 */
const extractTitle = (doc) => {
  const heading = doc.querySelector('article.doc h1')?.textContent?.trim();
  if (heading) return heading;

  const pageTitle = doc
    .querySelector('title')
    ?.textContent?.trim()
    .replace(/\|.*$/, '')
    .trim();
  return pageTitle || 'Untitled';
};
97198
98- const urlPath = '/' + relative ( BUILD_DIR , dirname ( htmlPath ) ) + '/' ;
99- manifest [ urlPath ] = tokens ;
100- converted ++ ;
199+ // ---------------------------------------------------------------------------
200+ // File walking
201+ // ---------------------------------------------------------------------------
202+
/**
 * Recursively collect the paths of all `.html` files under `dir`.
 *
 * Entries are visited in code-unit name order within each directory, so the
 * result does not depend on the order `readdir` happens to return.
 * Subdirectories are read concurrently.
 *
 * @param {string} dir - Directory to scan.
 * @returns {Promise<string[]>} Paths built with `join(dir, name)`.
 */
const collectHtmlFiles = async (dir) => {
  const entries = await readdir(dir, { withFileTypes: true });
  // Names are unique within a directory, so the comparator never needs 0.
  const ordered = [...entries].sort((a, b) => (a.name < b.name ? -1 : 1));

  const nested = await Promise.all(
    ordered.map((entry) => {
      const full = join(dir, entry.name);
      if (entry.isDirectory()) return collectHtmlFiles(full);
      return entry.name.endsWith('.html') ? [full] : [];
    })
  );
  return nested.flat();
};
215+
216+ // ---------------------------------------------------------------------------
217+ // Single-page conversion
218+ // ---------------------------------------------------------------------------
219+
/**
 * Convert one HTML page into a Markdown sibling written next to it
 * (same path with `.html` replaced by `.md`).
 *
 * @param {string} htmlPath - Path of the HTML file to convert.
 * @returns {Promise<{ path: string, tokens: number } | null>} Manifest entry
 *   (URL path of the page's directory relative to BUILD_DIR, plus the token
 *   count of the Markdown body), or null when the page has no `article.doc`.
 */
const convertPage = async (htmlPath) => {
  const html = await readFile(htmlPath, 'utf-8');
  const dom = new JSDOM(html);

  try {
    const { document } = dom.window;
    const articleEl = document.querySelector('article.doc');
    if (!articleEl) return null;

    const title = extractTitle(document);
    const markdown = toMarkdown(articleEl, dom);
    const tokens = encode(markdown).length;

    const mdPath = htmlPath.replace(/\.html$/, '.md');
    await mkdir(dirname(mdPath), { recursive: true });
    await writeFile(mdPath, `${buildFrontmatter(title, tokens)}${markdown}\n`, 'utf-8');

    // A page directly in BUILD_DIR has an empty relative directory; map it to
    // "/" instead of the malformed "//".
    const relDir = relative(BUILD_DIR, dirname(htmlPath));
    return { path: relDir === '' ? '/' : `/${relDir}/`, tokens };
  } finally {
    // Release the window once the page is done; many pages are processed in one run.
    dom.window.close();
  }
};
238+
239+ // ---------------------------------------------------------------------------
240+ // Main
241+ // ---------------------------------------------------------------------------
242+
/**
 * Entry point: convert every HTML page under BUILD_DIR, write the token
 * manifest (`_markdown-manifest.json`, mapping URL path to token count) into
 * BUILD_DIR, and log a summary.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  console.log(`Generating markdown siblings in ${BUILD_DIR}…`);

  const htmlFiles = await collectHtmlFiles(BUILD_DIR);
  const manifest = {};
  let converted = 0;

  // Pages are converted one at a time.
  for (const htmlPath of htmlFiles) {
    const page = await convertPage(htmlPath);
    if (!page) continue;
    manifest[page.path] = page.tokens;
    converted += 1;
  }

  const manifestPath = join(BUILD_DIR, '_markdown-manifest.json');
  await writeFile(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`, 'utf-8');

  const skipped = htmlFiles.length - converted;
  console.log(
    `Done. ${converted} pages converted, ${skipped} skipped (no article.doc). Manifest → ${manifestPath}`
  );
};
110261
111262main ( ) . catch ( ( err ) => {
112263 console . error ( err ) ;
0 commit comments