Skip to content

Commit 3fba659

Browse files
committed
fix(md): <main> content stripping
1 parent b4f289b commit 3fba659

2 files changed

Lines changed: 45 additions & 7 deletions

File tree

src/md/fromHtml.test.ts

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -616,6 +616,34 @@ describe('strips form elements', () => {
616616
expect(result).not.toContain('Page 0')
617617
})
618618

619+
test('preserves high link density content inside main', async () => {
620+
const links = Array.from(
621+
{ length: 10 },
622+
(_, i) => `<a href="/news/${i}">Research update ${i}</a>`,
623+
).join(' ')
624+
const { content: result } = await fromHtml(
625+
html({
626+
body: `<main><h1>News</h1><section>${links}</section></main>`,
627+
}),
628+
)
629+
expect(result).toContain('# News')
630+
expect(result).toContain('Research update 0')
631+
})
632+
633+
test('preserves high link density content inside role main', async () => {
634+
const links = Array.from(
635+
{ length: 10 },
636+
(_, i) => `<a href="/news/${i}">Product announcement ${i}</a>`,
637+
).join(' ')
638+
const { content: result } = await fromHtml(
639+
html({
640+
body: `<div role="main"><h1>Updates</h1><section>${links}</section></div>`,
641+
}),
642+
)
643+
expect(result).toContain('# Updates')
644+
expect(result).toContain('Product announcement 0')
645+
})
646+
619647
test('preserves content wrappers that contain an article', async () => {
620648
const links = Array.from(
621649
{ length: 10 },

src/md/fromHtml.ts

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -153,7 +153,7 @@ const linkDensityBlockTags = new Set(['div', 'ol', 'section', 'ul'])
153153

154154
function rehypeStripNoise(profile?: Profile<Record<string, unknown>>) {
155155
return (tree: Root) => {
156-
strip(tree, false, profile)
156+
strip(tree, false, false, profile)
157157
}
158158
}
159159

@@ -162,13 +162,16 @@ const sectioningTags = new Set(['article', 'main', 'section'])
162162
function strip(
163163
node: Element | Root,
164164
inSectioning = false,
165+
inContentContainer = false,
165166
profile?: Profile<Record<string, unknown>>,
166167
) {
167168
if (!node.children) return
168169
node.children = node.children.filter((child) => {
169170
if (child.type === 'comment') return false
170171
if (child.type !== 'element') return true
171172
const knownContentRoot = isKnownContentRoot(child, profile)
173+
const childInContentContainer =
174+
inContentContainer || knownContentRoot || isContentContainer(child)
172175

173176
if (strippedTagNames.has(child.tagName)) return false
174177

@@ -183,17 +186,24 @@ function strip(
183186
if (isSkipLink(child)) return false
184187
if (!knownContentRoot && !containsContentContainer(child) && matchesNoiseClassId(child))
185188
return false
186-
if (!knownContentRoot && isHighLinkDensity(child)) return false
189+
if (!childInContentContainer && isHighLinkDensity(child)) return false
187190

188-
strip(child, inSectioning || sectioningTags.has(child.tagName), profile)
191+
strip(
192+
child,
193+
inSectioning || sectioningTags.has(child.tagName),
194+
childInContentContainer,
195+
profile,
196+
)
189197
return true
190198
})
191199
}
192200

193201
function containsContentContainer(node: Element): boolean {
194-
return (
195-
node.tagName === 'article' || node.tagName === 'main' || hasDescendantContentContainer(node)
196-
)
202+
return isContentContainer(node) || hasDescendantContentContainer(node)
203+
}
204+
205+
function isContentContainer(node: Element): boolean {
206+
return node.tagName === 'article' || node.tagName === 'main' || node.properties?.role === 'main'
197207
}
198208

199209
function isKnownContentRoot(node: Element, profile?: Profile<Record<string, unknown>>): boolean {
@@ -297,7 +307,7 @@ function getLinkTextLength(node: Element): number {
297307
function hasDescendantContentContainer(node: Element): boolean {
298308
for (const child of node.children) {
299309
if (child.type !== 'element') continue
300-
if (child.tagName === 'article' || child.tagName === 'main') return true
310+
if (isContentContainer(child)) return true
301311
if (hasDescendantContentContainer(child)) return true
302312
}
303313
return false

0 commit comments

Comments (0)