@@ -153,7 +153,7 @@ const linkDensityBlockTags = new Set(['div', 'ol', 'section', 'ul'])
153153
/**
 * Builds a tree transformer that runs `strip` over an entire document.
 *
 * @param profile - Optional site profile, forwarded unchanged to `strip`.
 * @returns A function that takes the document root and prunes its children in place.
 */
function rehypeStripNoise(profile?: Profile<Record<string, unknown>>) {
  return (tree: Root) => {
    // The walk starts at the root, so neither the "inside a sectioning element"
    // flag nor the "inside a content container" flag is set yet.
    strip(tree, false, false, profile)
  }
}
159159
@@ -162,13 +162,16 @@ const sectioningTags = new Set(['article', 'main', 'section'])
162162function strip (
163163 node : Element | Root ,
164164 inSectioning = false ,
165+ inContentContainer = false ,
165166 profile ?: Profile < Record < string , unknown > > ,
166167) {
167168 if ( ! node . children ) return
168169 node . children = node . children . filter ( ( child ) => {
169170 if ( child . type === 'comment' ) return false
170171 if ( child . type !== 'element' ) return true
171172 const knownContentRoot = isKnownContentRoot ( child , profile )
173+ const childInContentContainer =
174+ inContentContainer || knownContentRoot || isContentContainer ( child )
172175
173176 if ( strippedTagNames . has ( child . tagName ) ) return false
174177
@@ -183,17 +186,24 @@ function strip(
183186 if ( isSkipLink ( child ) ) return false
184187 if ( ! knownContentRoot && ! containsContentContainer ( child ) && matchesNoiseClassId ( child ) )
185188 return false
186- if ( ! knownContentRoot && isHighLinkDensity ( child ) ) return false
189+ if ( ! childInContentContainer && isHighLinkDensity ( child ) ) return false
187190
188- strip ( child , inSectioning || sectioningTags . has ( child . tagName ) , profile )
191+ strip (
192+ child ,
193+ inSectioning || sectioningTags . has ( child . tagName ) ,
194+ childInContentContainer ,
195+ profile ,
196+ )
189197 return true
190198 } )
191199}
192200
/**
 * Reports whether `node` is a content container itself or has one anywhere
 * among its descendants.
 */
function containsContentContainer(node: Element): boolean {
  return isContentContainer(node) || hasDescendantContentContainer(node)
}

/**
 * Reports whether `node` on its own counts as a content container: an
 * `<article>` or `<main>` element, or any element whose `role` property is
 * exactly `'main'`.
 */
function isContentContainer(node: Element): boolean {
  return node.tagName === 'article' || node.tagName === 'main' || node.properties?.role === 'main'
}
198208
199209function isKnownContentRoot ( node : Element , profile ?: Profile < Record < string , unknown > > ) : boolean {
@@ -297,7 +307,7 @@ function getLinkTextLength(node: Element): number {
297307function hasDescendantContentContainer ( node : Element ) : boolean {
298308 for ( const child of node . children ) {
299309 if ( child . type !== 'element' ) continue
300- if ( child . tagName === 'article' || child . tagName === 'main' ) return true
310+ if ( isContentContainer ( child ) ) return true
301311 if ( hasDescendantContentContainer ( child ) ) return true
302312 }
303313 return false
0 commit comments