modelcontextprotocol · Christian-Sidak · Apr 12, 2026 · Apr 22, 2026 · Apr 24, 2026 · Apr 25, 2026
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
@@ -3,6 +3,7 @@
 
 import markdownify
 import readabilipy.simple_json
+from bs4 import BeautifulSoup
 from mcp.shared.exceptions import McpError
 from mcp.server import Server
 from mcp.server.stdio import stdio_server
@@ -27,22 +28,53 @@
 def extract_content_from_html(html: str) -> str:
     """Extract and convert HTML content to Markdown format.
 
+    Three extraction strategies are tried in order; the first one that
+    produces non-blank Markdown is returned:
+
+    1. readabilipy with Mozilla Readability enabled (the primary method).
+    2. readabilipy with Readability disabled (less aggressive filtering).
+    3. markdownify on the full document, with <script> and <style>
+       elements removed first.
+
+    The fallbacks cover pages where Readability returns empty content,
+    which commonly happens with progressive SSR sites that deliver content
+    in hidden containers awaiting client-side hydration.
+
     Args:
         html: Raw HTML content to process
 
     Returns:
-        Simplified markdown version of the content
+        Simplified markdown version of the content, or an "<error>...</error>"
+        message when every strategy yields blank output
     """
-    ret = readabilipy.simple_json.simple_json_from_html_string(
-        html, use_readability=True
-    )
-    if not ret["content"]:
-        return "<error>Page failed to be simplified from HTML</error>"
-    content = markdownify.markdownify(
-        ret["content"],
-        heading_style=markdownify.ATX,
-    )
-    return content
+    # Stages 1 and 2: readabilipy, first with Readability (best quality for
+    # standard pages), then without it (less aggressive, does not filter by
+    # CSS visibility).
+    for use_readability in (True, False):
+        extracted = readabilipy.simple_json.simple_json_from_html_string(
+            html, use_readability=use_readability
+        )
+        # "content" may be absent, None or empty; all of those mean "try next".
+        simplified_html = extracted.get("content", "")
+        if not simplified_html:
+            continue
+        markdown = markdownify.markdownify(
+            simplified_html,
+            heading_style=markdownify.ATX,
+        )
+        if markdown.strip():
+            return markdown
+
+    # Stage 3: convert the full HTML directly with markdownify (last resort).
+    # Strip <script> and <style> first — markdownify renders them verbatim as
+    # plain text, which injects large blobs of JS/CSS noise into the output.
+    soup = BeautifulSoup(html, "html.parser")
+    for noisy_tag in soup.find_all(["script", "style"]):
+        noisy_tag.decompose()
+    markdown = markdownify.markdownify(str(soup), heading_style=markdownify.ATX)
+    if not markdown.strip():
+        return "<error>Page failed to be simplified from HTML</error>"
+    return markdown
 
 
 def get_robots_txt_url(url: str) -> str:

diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
@@ -324,3 +324,102 @@ async def test_fetch_with_proxy(self):
 
             # Verify AsyncClient was called with proxy
             mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")
+
+
+class TestExtractContentFallback:
+    """Tests for the staged fallback in extract_content_from_html."""
+
+    def test_readability_sufficient_content_no_fallback(self):
+        """When Readability returns non-empty content, no fallback is triggered."""
+        html = "<html><body>" + "<p>word </p>" * 200 + "</body></html>"
+        readability_content = "<div>" + "<p>word </p>" * 200 + "</div>"
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": readability_content}
+            result = extract_content_from_html(html)
+            # Should only be called once (Readability path succeeds)
+            assert mock_readability.call_count == 1
+            assert "word" in result
+
+    def test_readability_strips_content_falls_back_to_no_readability(self):
+        """When Readability returns empty content, falls back to non-Readability extraction."""
+        # Simulate an SSR page where Readability strips all hidden containers, returning empty
+        html = "<html><body>" + "<p>content </p>" * 500 + "</body></html>"
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                # Readability stripped everything, returns empty string
+                return {"content": ""}
+            else:
+                # Without Readability, returns full content
+                return {"content": "<div>" + "<p>content </p>" * 500 + "</div>"}
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "content" in result
+            assert len(result.strip()) > 100
+
+    def test_both_readability_modes_fail_falls_back_to_markdownify(self):
+        """When both readabilipy modes return empty content, falls back to raw markdownify."""
+        html = "<html><body>" + "<p>important data </p>" * 300 + "</body></html>"
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            # Both modes return empty content
+            mock_readability.return_value = {"content": ""}
+            result = extract_content_from_html(html)
+            # Should fall through to markdownify on raw HTML
+            assert "important data" in result
+            assert mock_readability.call_count == 2  # called for both modes
+
+    def test_markdownify_fallback_strips_script_and_style(self):
+        """The raw markdownify fallback drops <script> and <style> contents."""
+        html = (
+            "<html><head><style>.hidden { display: none; }</style></head>"
+            "<body><script>var secret = 1;</script><p>visible text</p></body></html>"
+        )
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": ""}
+            result = extract_content_from_html(html)
+            assert "visible text" in result
+            assert "var secret" not in result
+            assert "display: none" not in result
+
+    def test_completely_empty_html_returns_error(self):
+        """Completely empty HTML returns error message."""
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": ""}
+            result = extract_content_from_html("")
+            assert "<error>" in result
+
+    def test_readability_none_content_triggers_fallback(self):
+        """When Readability returns None content, fallback is triggered."""
+        html = "<html><body>" + "<p>real content </p>" * 200 + "</body></html>"
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                return {"content": None}  # Readability returns None
+            return {"content": "<div>" + "<p>real content </p>" * 200 + "</div>"}
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "real content" in result
+
+    def test_small_readability_output_accepted(self):
+        """Non-empty Readability output is accepted regardless of size ratio."""
+        padding = "x" * 9000
+        html = f"<html><body><div style=\"visibility:hidden\">{padding}</div><p>tiny</p></body></html>"
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                return {"content": "<p>tiny</p>"}
+            else:
+                return {"content": f"<div>{padding}</div><p>tiny</p>"}
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json) as mock_readability:
+            result = extract_content_from_html(html)
+            # Readability returned non-empty content, so it should be used directly
+            assert mock_readability.call_count == 1
+            assert "tiny" in result
+            # Padding exists only in the non-Readability output, so its absence proves no fallback ran
+            assert padding not in result