Skip to content

Commit a5ae267

Browse files
fix(fetch): fall back when Readability strips hidden SSR content
Add a three-stage extraction pipeline to extract_content_from_html():

1. Readability (existing, best quality for standard pages)
2. readabilipy without Readability JS (less aggressive, no CSS visibility filtering)
3. Raw markdownify conversion (last resort)

Stages 2 and 3 only activate when stage 1 produces text shorter than 1% of the input HTML length, which indicates Readability stripped meaningful content. This commonly happens with progressive SSR sites that deliver content in hidden containers (visibility:hidden, position:absolute) awaiting client-side hydration.

No new dependencies. No behavior change for sites where Readability works correctly.

Fixes #3878
1 parent f424458 commit a5ae267

2 files changed

Lines changed: 129 additions & 4 deletions

File tree

src/fetch/src/mcp_server_fetch/server.py

Lines changed: 41 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -27,22 +27,59 @@
2727
def extract_content_from_html(html: str) -> str:
    """Convert an HTML document to Markdown, degrading gracefully.

    Extraction is attempted in up to three stages, stopping at the first
    stage whose output is accepted:

    1. readabilipy with Mozilla Readability enabled (best quality for
       standard pages).
    2. readabilipy with Readability disabled (less aggressive filtering).
    3. markdownify applied directly to the raw HTML (last resort).

    Stages 1 and 2 are accepted only when the stripped Markdown is at least
    1% of the input HTML length (never less than one character). A shorter
    result is taken as a sign that meaningful content was stripped, which
    commonly happens with progressive SSR sites that deliver content in
    hidden containers awaiting client-side hydration.

    Args:
        html: Raw HTML content to process

    Returns:
        Simplified markdown version of the content, or an ``<error>`` marker
        string when even the raw conversion yields only whitespace.
    """
    # Smallest stripped-Markdown length accepted from a readabilipy stage.
    threshold = max(1, len(html) // 100)

    # Stage 1 (use_readability=True), then stage 2 (use_readability=False).
    for use_readability in (True, False):
        extracted = readabilipy.simple_json.simple_json_from_html_string(
            html, use_readability=use_readability
        )
        article_html = extracted.get("content", "")
        if not article_html:
            # Missing, None or empty content: move on to the next stage.
            continue
        markdown = markdownify.markdownify(
            article_html,
            heading_style=markdownify.ATX,
        )
        if len(markdown.strip()) >= threshold:
            return markdown

    # Stage 3: convert the untouched input; no length threshold applies here.
    raw_markdown = markdownify.markdownify(
        html,
        heading_style=markdownify.ATX,
    )
    if not raw_markdown.strip():
        return "<error>Page failed to be simplified from HTML</error>"
    return raw_markdown
4683

4784

4885
def get_robots_txt_url(url: str) -> str:

src/fetch/tests/test_server.py

Lines changed: 88 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -324,3 +324,91 @@ async def test_fetch_with_proxy(self):
324324

325325
# Verify AsyncClient was called with proxy
326326
mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")
327+
328+
329+
330+
class TestExtractContentFallback:
    """Tests for the fallback extraction in extract_content_from_html.

    Every test patches readabilipy's ``simple_json_from_html_string`` so the
    outcome of each extraction stage is controlled by the test; markdownify
    itself is not patched.
    """

    # Patch target shared by all tests in this class.
    READABILIPY = "readabilipy.simple_json.simple_json_from_html_string"

    def test_readability_sufficient_content_no_fallback(self):
        """When Readability returns enough content, no fallback is triggered."""
        html = "<html><body>" + "<p>word </p>" * 200 + "</body></html>"
        readability_content = "<div>" + "<p>word </p>" * 200 + "</div>"

        with patch(self.READABILIPY) as mock_readability:
            mock_readability.return_value = {"content": readability_content}
            result = extract_content_from_html(html)
            # Should only be called once (Readability path succeeds)
            assert mock_readability.call_count == 1
            assert "word" in result

    def test_readability_strips_content_falls_back_to_no_readability(self):
        """When Readability returns too little, falls back to non-Readability extraction."""
        # Simulate a large HTML page where Readability strips hidden SSR content
        html = "<html><body>" + "<p>content </p>" * 500 + "</body></html>"

        def mock_simple_json(h, use_readability=True):
            if use_readability:
                # Readability stripped everything, returns almost nothing
                return {"content": "<div>Loading...</div>"}
            # Without Readability, returns full content
            return {"content": "<div>" + "<p>content </p>" * 500 + "</div>"}

        with patch(self.READABILIPY, side_effect=mock_simple_json) as mock_readability:
            result = extract_content_from_html(html)
            # Both readabilipy modes must have been tried, in that order
            assert mock_readability.call_count == 2
            assert "content" in result
            assert "Loading..." not in result
            assert len(result.strip()) > 100

    def test_both_readability_modes_fail_falls_back_to_markdownify(self):
        """When both readabilipy modes return too little, falls back to raw markdownify."""
        html = "<html><body>" + "<p>important data </p>" * 300 + "</body></html>"

        with patch(self.READABILIPY) as mock_readability:
            # Both modes return empty/minimal content
            mock_readability.return_value = {"content": ""}
            result = extract_content_from_html(html)
            # Should fall through to markdownify on raw HTML
            assert "important data" in result
            assert mock_readability.call_count == 2  # called for both modes

    def test_completely_empty_html_returns_error(self):
        """Completely empty HTML returns error message."""
        with patch(self.READABILIPY) as mock_readability:
            mock_readability.return_value = {"content": ""}
            result = extract_content_from_html("")
            assert "<error>" in result

    def test_readability_none_content_triggers_fallback(self):
        """When Readability returns None content, fallback is triggered."""
        html = "<html><body>" + "<p>real content </p>" * 200 + "</body></html>"
        fallback_content = "<div>" + "<p>real content </p>" * 200 + "</div>"

        # First call (Readability) yields None; second call yields real content.
        responses = [{"content": None}, {"content": fallback_content}]

        with patch(self.READABILIPY, side_effect=responses) as mock_readability:
            result = extract_content_from_html(html)
            assert mock_readability.call_count == 2
            assert "real content" in result

    def test_threshold_is_one_percent_of_html(self):
        """The fallback threshold is 1% of the input HTML length."""
        # Pad the hidden container so the HTML is exactly 10,000 characters,
        # which makes the threshold exactly 100 characters.
        wrapper = (
            '<html><body><div style="visibility:hidden">{}</div>'
            "<p>tiny</p></body></html>"
        )
        padding = "x" * (10_000 - len(wrapper.format("")))
        html = wrapper.format(padding)
        assert len(html) == 10_000

        def mock_simple_json(h, use_readability=True):
            if use_readability:
                # Returns less than 1% of input
                return {"content": "<p>tiny</p>"}
            return {"content": f"<div>{padding}</div><p>tiny</p>"}

        with patch(self.READABILIPY, side_effect=mock_simple_json) as mock_readability:
            result = extract_content_from_html(html)
            # "tiny" (4 chars) < 100 (1% threshold), so the fallback must run
            assert mock_readability.call_count == 2
            assert "tiny" in result
            assert len(result.strip()) >= 100

0 commit comments

Comments
 (0)