diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..a727e4d7e0 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -27,22 +27,59 @@
 def extract_content_from_html(html: str) -> str:
     """Extract and convert HTML content to Markdown format.
 
+    Uses Mozilla Readability via readabilipy as the primary extraction method.
+    Falls back to readabilipy without Readability (less aggressive filtering)
+    or direct markdownify conversion when Readability strips too much content,
+    which commonly happens with progressive SSR sites that deliver content in
+    hidden containers awaiting client-side hydration.
+
     Args:
         html: Raw HTML content to process
 
     Returns:
         Simplified markdown version of the content
     """
+    # Minimum expected content length as a fraction of input HTML.
+    # If extracted text is shorter than this, Readability likely stripped
+    # meaningful content (e.g. hidden SSR markup).
+    min_expected_length = max(1, len(html) // 100)
+
+    # Stage 1: Try Readability (best quality for standard pages)
     ret = readabilipy.simple_json.simple_json_from_html_string(
         html, use_readability=True
     )
-    if not ret["content"]:
-        return "Page failed to be simplified from HTML"
+    content_html = ret.get("content", "")
+    if content_html:
+        content = markdownify.markdownify(
+            content_html,
+            heading_style=markdownify.ATX,
+        )
+        if len(content.strip()) >= min_expected_length:
+            return content
+
+    # Stage 2: Try readabilipy without Readability JS (less aggressive,
+    # does not filter by CSS visibility)
+    ret = readabilipy.simple_json.simple_json_from_html_string(
+        html, use_readability=False
+    )
+    content_html = ret.get("content", "")
+    if content_html:
+        content = markdownify.markdownify(
+            content_html,
+            heading_style=markdownify.ATX,
+        )
+        if len(content.strip()) >= min_expected_length:
+            return content
+
+    # Stage 3: Convert full HTML directly with markdownify (last resort)
     content = markdownify.markdownify(
-        ret["content"],
+        html,
         heading_style=markdownify.ATX,
     )
-    return content
+    if content.strip():
+        return content
+
+    return "Page failed to be simplified from HTML"
 
 
 def get_robots_txt_url(url: str) -> str:
diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
index 96c1cb38c7..d3ed2a8ee0 100644
--- a/src/fetch/tests/test_server.py
+++ b/src/fetch/tests/test_server.py
@@ -324,3 +324,91 @@ async def test_fetch_with_proxy(self):
 
             # Verify AsyncClient was called with proxy
             mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")
+
+
+
+class TestExtractContentFallback:
+    """Tests for the fallback extraction in extract_content_from_html."""
+
+    def test_readability_sufficient_content_no_fallback(self):
+        """When Readability returns enough content, no fallback is triggered."""
+        html = "<html><body>" + "<p>word</p>" * 200 + "</body></html>"
+        readability_content = "<div>" + "<p>word</p>" * 200 + "</div>"
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": readability_content}
+            result = extract_content_from_html(html)
+            # Should only be called once (Readability path succeeds)
+            assert mock_readability.call_count == 1
+            assert "word" in result
+
+    def test_readability_strips_content_falls_back_to_no_readability(self):
+        """When Readability returns too little, falls back to non-Readability extraction."""
+        # Simulate a large HTML page where Readability strips hidden SSR content
+        html = "<html><body>" + "<p>content</p>" * 500 + "</body></html>"
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                # Readability stripped everything, returns almost nothing
+                return {"content": "<div>Loading...</div>"}
+            else:
+                # Without Readability, returns full content
+                return {"content": "<div>" + "<p>content</p>" * 500 + "</div>"}
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "content" in result
+            assert len(result.strip()) > 100
+
+    def test_both_readability_modes_fail_falls_back_to_markdownify(self):
+        """When both readabilipy modes return too little, falls back to raw markdownify."""
+        html = "<html><body>" + "<p>important data</p>" * 300 + "</body></html>"
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            # Both modes return empty/minimal content
+            mock_readability.return_value = {"content": ""}
+            result = extract_content_from_html(html)
+            # Should fall through to markdownify on raw HTML
+            assert "important data" in result
+            assert mock_readability.call_count == 2  # called for both modes
+
+    def test_completely_empty_html_returns_error(self):
+        """Completely empty HTML returns error message."""
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": ""}
+            result = extract_content_from_html("")
+            assert "Page failed to be simplified from HTML" in result
+
+    def test_readability_none_content_triggers_fallback(self):
+        """When Readability returns None content, fallback is triggered."""
+        html = "<html><body>" + "<p>real content</p>" * 200 + "</body></html>"
+
+        call_count = [0]
+        def mock_simple_json(h, use_readability=True):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                return {"content": None}  # Readability returns None
+            else:
+                return {"content": "<div>" + "<p>real content</p>" * 200 + "</div>"}
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "real content" in result
+
+    def test_threshold_is_one_percent_of_html(self):
+        """The fallback threshold is 1% of the input HTML length."""
+        # ~9000 chars of HTML -> threshold is ~90 chars
+        padding = "x" * 9000
+        html = f"<html><body><div>{padding}</div><p>tiny</p></body></html>"
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                # Returns less than 1% of input
+                return {"content": "<p>tiny</p>"}
+            else:
+                return {"content": f"<div>{padding}</div><p>tiny</p>"}
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            # Should have triggered fallback since "tiny" (4 chars) < ~90 (1% threshold)
+            assert len(result.strip()) > 50