/** MIME type string (Markdown, UTF-8 charset). */
const MARKDOWN_CONTENT_TYPE = "text/markdown; charset=utf-8";

/** Numeric value of the DOM `Node.TEXT_NODE` node type. */
const TEXT_NODE = 3;

/** Numeric value of the DOM `Node.ELEMENT_NODE` node type. */
const ELEMENT_NODE = 1;

/**
 * CSS selectors for page chrome and non-content elements.
 *
 * `pruneDocument` removes every node matching one of these selectors before
 * the document is rendered to Markdown. The list is read-only so it cannot be
 * mutated by accident at runtime.
 */
const NOISE_SELECTORS: readonly string[] = [
  // Non-rendered / non-textual elements.
  "script",
  "style",
  "noscript",
  "template",
  "svg",
  // Structural page chrome.
  "header",
  "footer",
  "nav",
  "aside",
  // Site-specific chrome, by class.
  ".site-header",
  ".site-footer",
  ".side-bar",
  ".secondary-nav",
  ".search-input-wrap",
  ".social-media-list",
  ".footer-nav",
  ".skip-to-content-link",
  ".header-anchor",
  ".next-previous",
  ".next-previous--feedback",
  ".home-banner-info",
  // Site-specific chrome, by id.
  "#secondary-nav",
  "#site-nav",
  "#toggled-search",
  // Navigation landmarks, by accessible label.
  "[aria-label='Main navigation']",
  "[aria-label='Footer navigation']",
];
432
533export default async ( request : Request , context : { next : ( ) => Promise < Response > } ) => {
634 const response = await context . next ( ) ;
@@ -76,18 +104,15 @@ function htmlToMarkdown(html: string, baseUrl: URL): string {
76104 return htmlToMarkdownFallback ( html , baseUrl ) ;
77105 }
78106
79- for ( const selector of [ "script" , "style" , "noscript" ] ) {
80- doc . querySelectorAll ( selector ) . forEach ( ( node : any ) => node . remove ( ) ) ;
81- }
107+ pruneDocument ( doc ) ;
82108
83109 const title = normalizeWhitespace ( doc . querySelector ( "title" ) ?. textContent || "" ) ;
84- const body = doc . body ;
85- const bodyMarkdown = body ? renderChildren ( body , baseUrl ) : "" ;
110+ const contentRoot = selectContentRoot ( doc ) ;
111+ const bodyMarkdown = contentRoot ? renderChildren ( contentRoot , baseUrl ) : "" ;
86112
87113 const parts : string [ ] = [ ] ;
88114 if ( title ) {
89115 parts . push ( `---\ntitle: ${ title } \n---` ) ;
90- parts . push ( `# ${ title } ` ) ;
91116 }
92117
93118 if ( bodyMarkdown ) {
@@ -102,19 +127,20 @@ function htmlToMarkdownFallback(html: string, baseUrl: URL): string {
102127 . replace ( / < ! - - [ \s \S ] * ?- - > / g, "" )
103128 . replace ( / < s c r i p t \b [ ^ > ] * > [ \s \S ] * ?< \/ s c r i p t > / gi, "" )
104129 . replace ( / < s t y l e \b [ ^ > ] * > [ \s \S ] * ?< \/ s t y l e > / gi, "" )
105- . replace ( / < n o s c r i p t \b [ ^ > ] * > [ \s \S ] * ?< \/ n o s c r i p t > / gi, "" ) ;
130+ . replace ( / < n o s c r i p t \b [ ^ > ] * > [ \s \S ] * ?< \/ n o s c r i p t > / gi, "" )
131+ . replace ( / < s v g \b [ ^ > ] * > [ \s \S ] * ?< \/ s v g > / gi, "" ) ;
106132
107133 const titleMatch = sanitized . match ( / < t i t l e \b [ ^ > ] * > ( [ \s \S ] * ?) < \/ t i t l e > / i) ;
108134 const title = titleMatch ? normalizeWhitespace ( decodeHtmlEntities ( stripTags ( titleMatch [ 1 ] ) ) ) : "" ;
109135
110136 const bodyMatch = sanitized . match ( / < b o d y \b [ ^ > ] * > ( [ \s \S ] * ?) < \/ b o d y > / i) ;
111- const bodyHtml = bodyMatch ? bodyMatch [ 1 ] : sanitized ;
137+ const rawBodyHtml = bodyMatch ? bodyMatch [ 1 ] : sanitized ;
138+ const bodyHtml = extractPrimaryHtmlFragment ( rawBodyHtml ) ;
112139 const bodyMarkdown = renderHtmlFragmentToMarkdown ( bodyHtml , baseUrl ) ;
113140
114141 const parts : string [ ] = [ ] ;
115142 if ( title ) {
116143 parts . push ( `---\ntitle: ${ title } \n---` ) ;
117- parts . push ( `# ${ title } ` ) ;
118144 }
119145
120146 if ( bodyMarkdown ) {
@@ -126,7 +152,7 @@ function htmlToMarkdownFallback(html: string, baseUrl: URL): string {
126152
127153function renderHtmlFragmentToMarkdown ( html : string , baseUrl : URL ) : string {
128154 const protectedBlocks : string [ ] = [ ] ;
129- let content = html ;
155+ let content = stripFallbackNoise ( html ) ;
130156
131157 // Protect code blocks so later tag cleanup does not alter their content.
132158 content = content . replace ( / < p r e \b [ ^ > ] * > ( [ \s \S ] * ?) < \/ p r e > / gi, ( _match , preContent : string ) => {
@@ -137,18 +163,10 @@ function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string {
137163 } ) ;
138164
139165 content = content . replace ( / < h ( [ 1 - 6 ] ) \b [ ^ > ] * > ( [ \s \S ] * ?) < \/ h \1> / gi, ( _match , level : string , text : string ) => {
140- const clean = decodeHtmlEntities ( stripTags ( text ) ) . trim ( ) ;
166+ const clean = normalizeWhitespace ( decodeHtmlEntities ( stripTags ( text ) ) ) ;
141167 return clean ? `\n\n${ "#" . repeat ( Number ( level ) ) } ${ clean } \n\n` : "" ;
142168 } ) ;
143169
144- content = content . replace ( / < a \b ( [ ^ > ] * ) > ( [ \s \S ] * ?) < \/ a > / gi, ( _match , attrs : string , text : string ) => {
145- const hrefMatch = attrs . match ( / h r e f \s * = \s * (?: " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / i) ;
146- const rawHref = hrefMatch ? hrefMatch [ 1 ] || hrefMatch [ 2 ] || hrefMatch [ 3 ] || "" : "" ;
147- const href = toAbsoluteUrl ( rawHref , baseUrl ) ;
148- const label = normalizeWhitespace ( decodeHtmlEntities ( stripTags ( text ) ) ) || href ;
149- return href ? `[${ label } ](${ href } )` : label ;
150- } ) ;
151-
152170 content = content . replace ( / < i m g \b ( [ ^ > ] * ) > / gi, ( _match , attrs : string ) => {
153171 const altMatch = attrs . match ( / a l t \s * = \s * (?: " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / i) ;
154172 const srcMatch = attrs . match ( / s r c \s * = \s * (?: " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / i) ;
@@ -159,6 +177,18 @@ function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string {
159177 return src ? `` : "" ;
160178 } ) ;
161179
180+ content = content . replace ( / < a \b ( [ ^ > ] * ) > ( [ \s \S ] * ?) < \/ a > / gi, ( _match , attrs : string , text : string ) => {
181+ const hrefMatch = attrs . match ( / h r e f \s * = \s * (?: " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / i) ;
182+ const rawHref = hrefMatch ? hrefMatch [ 1 ] || hrefMatch [ 2 ] || hrefMatch [ 3 ] || "" : "" ;
183+ const href = toAbsoluteUrl ( rawHref , baseUrl ) ;
184+ const label = normalizeWhitespace ( decodeHtmlEntities ( stripTags ( text ) ) ) ;
185+ if ( ! href || ! label ) {
186+ return label ;
187+ }
188+
189+ return `[${ label } ](${ href } )` ;
190+ } ) ;
191+
162192 content = content
163193 . replace ( / < ( s t r o n g | b ) \b [ ^ > ] * > ( [ \s \S ] * ?) < \/ \1> / gi, ( _m , _tag : string , text : string ) => {
164194 const clean = normalizeWhitespace ( decodeHtmlEntities ( stripTags ( text ) ) ) ;
@@ -185,7 +215,7 @@ function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string {
185215 content = decodeHtmlEntities ( stripTags ( content ) )
186216 . replace ( / \n { 3 , } / g, "\n\n" )
187217 . split ( "\n" )
188- . map ( ( line ) => line . trimEnd ( ) )
218+ . map ( ( line ) => line . trim ( ) )
189219 . join ( "\n" )
190220 . trim ( ) ;
191221
@@ -244,8 +274,12 @@ function renderNode(node: any, baseUrl: URL, listDepth: number): string {
244274 }
245275
246276 if ( tag === "a" ) {
247- const text = renderInlineChildren ( el , baseUrl ) || normalizeWhitespace ( el . getAttribute ( "href" ) || "" ) ;
277+ const text = renderInlineChildren ( el , baseUrl ) ;
248278 const href = toAbsoluteUrl ( el . getAttribute ( "href" ) , baseUrl ) ;
279+ if ( ! text ) {
280+ return "" ;
281+ }
282+
249283 return href ? `[${ text } ](${ href } )` : text ;
250284 }
251285
@@ -349,9 +383,20 @@ function renderInlineChildren(parent: any, baseUrl: URL): string {
349383 const tag = el . tagName . toLowerCase ( ) ;
350384
351385 if ( tag === "a" ) {
352- const text = renderInlineChildren ( el , baseUrl ) || normalizeWhitespace ( el . getAttribute ( "href" ) || "" ) ;
386+ const text = renderInlineChildren ( el , baseUrl ) ;
353387 const href = toAbsoluteUrl ( el . getAttribute ( "href" ) , baseUrl ) ;
354- out . push ( href ? `[${ text } ](${ href } )` : text ) ;
388+ if ( text ) {
389+ out . push ( href ? `[${ text } ](${ href } )` : text ) ;
390+ }
391+ continue ;
392+ }
393+
394+ if ( tag === "img" ) {
395+ const alt = normalizeWhitespace ( el . getAttribute ( "alt" ) || "image" ) ;
396+ const src = toAbsoluteUrl ( el . getAttribute ( "src" ) , baseUrl ) ;
397+ if ( src ) {
398+ out . push ( `` ) ;
399+ }
355400 continue ;
356401 }
357402
@@ -408,3 +453,70 @@ function toAbsoluteUrl(href: string | null, baseUrl: URL): string {
/**
 * Collapses every run of whitespace (spaces, tabs, newlines) into a single
 * space and trims leading and trailing whitespace.
 *
 * @param value - Text to normalize.
 * @returns The normalized, single-line text; `""` when `value` is empty or
 *   whitespace-only.
 */
function normalizeWhitespace(value: string): string {
  return value.replace(/\s+/g, " ").trim();
}
456+
/**
 * Removes from `doc` every node matched by any selector in `NOISE_SELECTORS`.
 *
 * Selectors are queried one at a time, in list order, and each matched node
 * is detached via its own `remove()` method. The document is mutated in place.
 *
 * @param doc - Document-like object exposing `querySelectorAll`.
 */
function pruneDocument(doc: { querySelectorAll: (selector: string) => any }): void {
  for (const selector of NOISE_SELECTORS) {
    doc.querySelectorAll(selector).forEach((node: { remove: () => void }) => {
      node.remove();
    });
  }
}
462+
/**
 * Picks the element whose children should be rendered as the page content.
 *
 * Resolution order:
 * 1. `<main>` itself, when its `class` attribute contains the exact class
 *    token `home-page`.
 * 2. The first element found by the preferred selectors below, tried in order
 *    from most specific (an article inside the main region) to least (`body`).
 * 3. `doc.body`, or `null` when nothing matched and there is no body.
 *
 * @param doc - Document-like object exposing `querySelector` and, optionally,
 *   `body`.
 * @returns The chosen root element, or `null` if none is available.
 */
function selectContentRoot(doc: { querySelector: (selector: string) => any; body?: any }): any {
  const main = doc.querySelector("main");
  if (main) {
    // Compare whole class tokens: a `\bhome-page\b` regex treats "-" as a word
    // boundary and would also accept classes such as "home-page-hero".
    const classAttr: unknown = main.getAttribute?.("class");
    const classTokens = typeof classAttr === "string" ? classAttr.split(/\s+/) : [];
    if (classTokens.includes("home-page")) {
      return main;
    }
  }

  const preferredSelectors = [
    "#main article",
    "main article",
    "article.guide",
    "article",
    "#main",
    "[role='main']",
    "main",
    "body",
  ];

  for (const selector of preferredSelectors) {
    const found = doc.querySelector(selector);
    if (found) {
      return found;
    }
  }

  return doc.body || null;
}
489+
/**
 * Regex-based counterpart of content-root selection for raw HTML strings.
 *
 * Resolution order:
 * 1. The inner HTML of the first `<main>`, when that same tag carries the
 *    exact class token `home-page` and the inner HTML is non-empty.
 * 2. The inner HTML of the first `<article>`.
 * 3. The inner HTML of the first `<main>`, when non-empty.
 * 4. The input unchanged.
 *
 * Matching is non-greedy and not nesting-aware: a nested `<main>` or
 * `<article>` ends the capture at the first closing tag.
 *
 * @param html - HTML fragment to search (typically the body's inner HTML).
 * @returns The most specific content fragment found, or `html` itself.
 */
function extractPrimaryHtmlFragment(html: string): string {
  // Capture the opening tag's attributes together with the content so the
  // class check applies to the very <main> whose content is returned.
  const mainMatch = html.match(/<main\b([^>]*)>([\s\S]*?)<\/main>/i);
  const mainAttrs = mainMatch?.[1] ?? "";
  const mainHtml = mainMatch?.[2] ?? "";

  // Accept double-quoted, single-quoted and unquoted attribute values.
  const classMatch = mainAttrs.match(/(?:^|\s)class\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i);
  const classValue = classMatch ? classMatch[1] ?? classMatch[2] ?? classMatch[3] ?? "" : "";
  // Whole-token comparison: "home-page-hero" or "my-home-page" must not count.
  const isHome = classValue.split(/\s+/).includes("home-page");

  if (isHome && mainHtml) {
    return mainHtml;
  }

  const articleMatch = html.match(/<article\b[^>]*>([\s\S]*?)<\/article>/i);
  if (articleMatch) {
    return articleMatch[1] ?? "";
  }

  if (mainHtml) {
    return mainHtml;
  }

  return html;
}
510+
/**
 * Removes page chrome from an HTML string using regular expressions, for use
 * when no DOM is available.
 *
 * Stripped, in order: `<template>`, `<header>`, `<footer>`, `<nav>` and
 * `<aside>` elements; `<div>`s with class `next-previous` or
 * `next-previous--feedback`; `<section>`s with class `home-banner-info`;
 * `<div>`s with class `side-bar`; and `<a>`s with class
 * `skip-to-content-link` or `header-anchor`.
 *
 * Class names are matched as whole tokens of a quoted `class` attribute, so
 * unrelated classes such as `side-bar-note` are left alone. Matching is
 * non-greedy and not nesting-aware: removal stops at the first matching
 * closing tag.
 *
 * @param html - HTML fragment to clean.
 * @returns The fragment with the matched elements removed.
 */
function stripFallbackNoise(html: string): string {
  // The (?<![\w-]) / (?![\w-]) guards make each class name match only as a
  // complete token; a plain \b would treat "-" as a boundary.
  return html
    .replace(/<template\b[^>]*>[\s\S]*?<\/template>/gi, "")
    .replace(/<header\b[^>]*>[\s\S]*?<\/header>/gi, "")
    .replace(/<footer\b[^>]*>[\s\S]*?<\/footer>/gi, "")
    .replace(/<nav\b[^>]*>[\s\S]*?<\/nav>/gi, "")
    .replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "")
    .replace(
      /<div\b[^>]*class=("|')[^"']*(?<![\w-])next-previous(?:--feedback)?(?![\w-])[^"']*\1[^>]*>[\s\S]*?<\/div>/gi,
      "",
    )
    .replace(
      /<section\b[^>]*class=("|')[^"']*(?<![\w-])home-banner-info(?![\w-])[^"']*\1[^>]*>[\s\S]*?<\/section>/gi,
      "",
    )
    .replace(/<div\b[^>]*class=("|')[^"']*(?<![\w-])side-bar(?![\w-])[^"']*\1[^>]*>[\s\S]*?<\/div>/gi, "")
    .replace(
      /<a\b[^>]*class=("|')[^"']*(?<![\w-])skip-to-content-link(?![\w-])[^"']*\1[^>]*>[\s\S]*?<\/a>/gi,
      "",
    )
    .replace(/<a\b[^>]*class=("|')[^"']*(?<![\w-])header-anchor(?![\w-])[^"']*\1[^>]*>[\s\S]*?<\/a>/gi, "");
}
0 commit comments