fix(fetch): fall back when Readability strips hidden SSR content

Christian-Sidak · Christian-Sidak · commit a5ae26718d17 · 2026-04-12T11:25:41.000-07:00
Add a three-stage extraction pipeline to extract_content_from_html(): (1) Readability (existing, best quality for standard pages); (2) readabilipy without Readability JS (less aggressive, no CSS visibility filtering); (3) raw markdownify conversion (last resort). Stage 2 only runs when stage 1 produces text shorter than 1% of the input HTML length, which indicates Readability stripped meaningful content; stage 3 only runs when stage 2 also falls below that threshold. This commonly happens with progressive SSR sites that deliver content in hidden containers (visibility:hidden, position:absolute) awaiting client-side hydration. No new dependencies. No behavior change for sites where Readability works correctly. Fixes #3878
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
@@ -27,22 +27,59 @@
 def extract_content_from_html(html: str) -> str:
     """Extract and convert HTML content to Markdown format.
 
+    Uses Mozilla Readability via readabilipy as the primary extraction method.
+    Falls back to readabilipy without Readability (less aggressive filtering)
+    or direct markdownify conversion when Readability strips too much content,
+    which commonly happens with progressive SSR sites that deliver content in
+    hidden containers awaiting client-side hydration.
+
     Args:
         html: Raw HTML content to process
 
     Returns:
         Simplified markdown version of the content
     """
+    # Minimum expected content length: 1% of the input HTML length, at least 1.
+    # If a stage's stripped Markdown is shorter than this, that stage likely
+    # dropped meaningful content (e.g. hidden SSR markup).
+    min_expected_length = max(1, len(html) // 100)
+
+    # Stage 1: Try Readability (best quality for standard pages)
     ret = readabilipy.simple_json.simple_json_from_html_string(
         html, use_readability=True
     )
-    if not ret["content"]:
-        return "<error>Page failed to be simplified from HTML</error>"
+    content_html = ret.get("content", "")
+    if content_html:
+        content = markdownify.markdownify(
+            content_html,
+            heading_style=markdownify.ATX,
+        )
+        if len(content.strip()) >= min_expected_length:
+            return content
+
+    # Stage 2: Try readabilipy without Readability JS (less aggressive,
+    # does not filter by CSS visibility)
+    ret = readabilipy.simple_json.simple_json_from_html_string(
+        html, use_readability=False
+    )
+    content_html = ret.get("content", "")
+    if content_html:
+        content = markdownify.markdownify(
+            content_html,
+            heading_style=markdownify.ATX,
+        )
+        if len(content.strip()) >= min_expected_length:
+            return content
+
+    # Stage 3: markdownify the full HTML (last resort; no length threshold)
     content = markdownify.markdownify(
-        ret["content"],
+        html,
         heading_style=markdownify.ATX,
     )
-    return content
+    if content.strip():
+        return content
+
+    return "<error>Page failed to be simplified from HTML</error>"
 
 
 def get_robots_txt_url(url: str) -> str:
diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
@@ -324,3 +324,91 @@ async def test_fetch_with_proxy(self):
 
             # Verify AsyncClient was called with proxy
             mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")
+
+
+
+class TestExtractContentFallback:
+    """Tests for the fallback extraction in extract_content_from_html."""
+
+    def test_readability_sufficient_content_no_fallback(self):
+        """When Readability returns enough content, no fallback is triggered."""
+        html = "<html><body>" + "<p>word </p>" * 200 + "</body></html>"
+        readability_content = "<div>" + "<p>word </p>" * 200 + "</div>"
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": readability_content}
+            result = extract_content_from_html(html)
+            # Should only be called once (Readability path succeeds)
+            assert mock_readability.call_count == 1
+            assert "word" in result
+
+    def test_readability_strips_content_falls_back_to_no_readability(self):
+        """When Readability returns too little, falls back to non-Readability extraction."""
+        # Simulate a large HTML page where Readability strips hidden SSR content
+        html = "<html><body>" + "<p>content </p>" * 500 + "</body></html>"
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                # Readability stripped everything, returns almost nothing
+                return {"content": "<div>Loading...</div>"}
+            else:
+                # Without Readability, returns full content
+                return {"content": "<div>" + "<p>content </p>" * 500 + "</div>"}
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "content" in result
+            assert len(result.strip()) > 100
+
+    def test_both_readability_modes_fail_falls_back_to_markdownify(self):
+        """When both readabilipy modes return too little, falls back to raw markdownify."""
+        html = "<html><body>" + "<p>important data </p>" * 300 + "</body></html>"
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            # Both modes return empty content (the mock ignores use_readability)
+            mock_readability.return_value = {"content": ""}
+            result = extract_content_from_html(html)
+            # Should fall through to markdownify on raw HTML
+            assert "important data" in result
+            assert mock_readability.call_count == 2  # called for both modes
+
+    def test_completely_empty_html_returns_error(self):
+        """Completely empty HTML returns error message."""
+        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": ""}
+            result = extract_content_from_html("")
+            assert "<error>" in result
+
+    def test_readability_none_content_triggers_fallback(self):
+        """When Readability returns None content, fallback is triggered."""
+        html = "<html><body>" + "<p>real content </p>" * 200 + "</body></html>"
+
+        call_count = [0]
+        def mock_simple_json(h, use_readability=True):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                return {"content": None}  # Readability returns None
+            else:
+                return {"content": "<div>" + "<p>real content </p>" * 200 + "</div>"}
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "real content" in result
+
+    def test_threshold_is_one_percent_of_html(self):
+        """The fallback threshold is 1% of the input HTML length."""
+        # 9000 chars of padding plus a little markup -> threshold is ~90 chars
+        padding = "x" * 9000
+        html = f"<html><body><div style=\"visibility:hidden\">{padding}</div><p>tiny</p></body></html>"
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                # Returns less than 1% of input
+                return {"content": "<p>tiny</p>"}
+            else:
+                return {"content": f"<div>{padding}</div><p>tiny</p>"}
+
+        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            # Should have triggered fallback since "tiny" (4 chars) < ~90 (1% threshold)
+            assert len(result.strip()) > 50