diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..a727e4d7e0 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -27,22 +27,59 @@
def extract_content_from_html(html: str) -> str:
    """Extract and convert HTML content to Markdown format.

    Extraction is attempted in stages, and the first stage whose output is
    long enough wins:

    1. readabilipy with Mozilla Readability enabled (best quality for
       standard pages).
    2. readabilipy with Readability disabled (less aggressive filtering).
    3. Direct markdownify conversion of the full HTML (last resort).

    The later stages exist for the case where Readability strips too much
    content, which commonly happens with progressive SSR sites that deliver
    content in hidden containers awaiting client-side hydration.

    Args:
        html: Raw HTML content to process

    Returns:
        Simplified markdown version of the content, or a fixed failure
        message when even the direct conversion yields only whitespace.
    """
    # Minimum expected content length as a fraction (1%) of the input HTML,
    # never less than one character. If the extracted text is shorter than
    # this, the extractor likely stripped meaningful content (e.g. hidden
    # SSR markup).
    min_expected_length = max(1, len(html) // 100)

    # Stages 1 and 2 differ only in the use_readability flag, so they share
    # one code path: Readability first, then the less aggressive mode.
    for use_readability in (True, False):
        ret = readabilipy.simple_json.simple_json_from_html_string(
            html, use_readability=use_readability
        )
        content_html = ret.get("content", "")
        if not content_html:
            # Missing, None or empty content: nothing to convert here.
            continue
        content = markdownify.markdownify(
            content_html,
            heading_style=markdownify.ATX,
        )
        if len(content.strip()) >= min_expected_length:
            return content

    # Stage 3: convert the full HTML directly with markdownify. The length
    # threshold is deliberately not applied here; any non-blank output is
    # better than reporting a failure.
    content = markdownify.markdownify(
        html,
        heading_style=markdownify.ATX,
    )
    if content.strip():
        return content

    return "Page failed to be simplified from HTML"
def get_robots_txt_url(url: str) -> str:
diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
index 96c1cb38c7..d3ed2a8ee0 100644
--- a/src/fetch/tests/test_server.py
+++ b/src/fetch/tests/test_server.py
@@ -324,3 +324,91 @@ async def test_fetch_with_proxy(self):
# Verify AsyncClient was called with proxy
mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")
+
+
+
class TestExtractContentFallback:
    """Tests for the fallback extraction in extract_content_from_html."""

    def test_readability_sufficient_content_no_fallback(self):
        """When Readability returns enough content, no fallback is triggered."""
        html = "<html><body>" + "<p>word</p>" * 200 + "</body></html>"
        # Comfortably above the 1% threshold for this input.
        readability_content = "<div>" + "<p>word</p>" * 200 + "</div>"

        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
            mock_readability.return_value = {"content": readability_content}
            result = extract_content_from_html(html)
            # Should only be called once (Readability path succeeds)
            assert mock_readability.call_count == 1
            mock_readability.assert_called_once_with(html, use_readability=True)
            assert "word" in result

    def test_readability_strips_content_falls_back_to_no_readability(self):
        """When Readability returns too little, falls back to non-Readability extraction."""
        # Simulate a large HTML page where Readability strips hidden SSR content
        html = "<html><body>" + "<p>content</p>" * 500 + "</body></html>"

        def mock_simple_json(h, use_readability=True):
            if use_readability:
                # Readability stripped everything, returns almost nothing
                return {"content": "<p>Loading...</p>"}
            # Without Readability, returns full content
            return {"content": "<div>" + "<p>content</p>" * 500 + "</div>"}

        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
            result = extract_content_from_html(html)
            assert "content" in result
            assert len(result.strip()) > 100
            # The placeholder from the Readability stage must not leak through.
            assert "Loading..." not in result

    def test_both_readability_modes_fail_falls_back_to_markdownify(self):
        """When both readabilipy modes return too little, falls back to raw markdownify."""
        html = "<html><body>" + "<p>important data</p>" * 300 + "</body></html>"

        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
            # Both modes return empty/minimal content
            mock_readability.return_value = {"content": ""}
            result = extract_content_from_html(html)
            # Should fall through to markdownify on raw HTML
            assert "important data" in result
            assert mock_readability.call_count == 2  # called for both modes
            # Readability must be tried before the less aggressive mode.
            modes = [c.kwargs["use_readability"] for c in mock_readability.call_args_list]
            assert modes == [True, False]

    def test_completely_empty_html_returns_error(self):
        """Completely empty HTML returns error message."""
        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
            mock_readability.return_value = {"content": ""}
            result = extract_content_from_html("")
            # A substring check on the message itself; asserting that the
            # empty string is contained in the result would always pass.
            assert "Page failed to be simplified from HTML" in result

    def test_readability_none_content_triggers_fallback(self):
        """When Readability returns None content, fallback is triggered."""
        html = "<html><body>" + "<p>real content</p>" * 200 + "</body></html>"

        # First call (Readability) yields None, second call yields real markup.
        responses = [
            {"content": None},
            {"content": "<div>" + "<p>real content</p>" * 200 + "</div>"},
        ]

        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=responses):
            result = extract_content_from_html(html)
            assert "real content" in result

    def test_threshold_is_one_percent_of_html(self):
        """The fallback threshold is 1% of the input HTML length."""
        # Build exactly 10000 chars of HTML -> threshold is 100 chars
        shell = "<html><body><div>{}</div><p>tiny</p></body></html>"
        padding = "x" * (10_000 - len(shell.format("")))
        html = shell.format(padding)
        assert len(html) == 10_000

        def mock_simple_json(h, use_readability=True):
            if use_readability:
                # Returns less than 1% of input
                return {"content": "<p>tiny</p>"}
            return {"content": f"<div>{padding}</div><p>tiny</p>"}

        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
            result = extract_content_from_html(html)
            # Should have triggered fallback since "tiny" (4 chars) < 100 (1% threshold)
            assert len(result.strip()) >= 100
            assert padding in result