Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions src/fetch/src/mcp_server_fetch/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,59 @@
def extract_content_from_html(html: str) -> str:
    """Extract and convert HTML content to Markdown format.

    Uses Mozilla Readability via readabilipy as the primary extraction method.
    Falls back to readabilipy without Readability (less aggressive filtering)
    or direct markdownify conversion when Readability strips too much content,
    which commonly happens with progressive SSR sites that deliver content in
    hidden containers awaiting client-side hydration.

    The stages are tried in order and the first acceptable result wins:

    1. readabilipy with ``use_readability=True``.
    2. readabilipy with ``use_readability=False``.
    3. markdownify applied directly to the raw HTML.

    Stages 1 and 2 are accepted only when the stripped Markdown is at least
    1% of the input HTML length (and at least one character). Stage 3 is
    accepted whenever it yields any non-whitespace text.

    Args:
        html: Raw HTML content to process

    Returns:
        Simplified markdown version of the content, or an ``<error>`` marker
        string when every stage produced nothing usable.
    """
    # Minimum expected content length as a fraction of input HTML.
    # If extracted text is shorter than this, Readability likely stripped
    # meaningful content (e.g. hidden SSR markup).
    min_expected_length = max(1, len(html) // 100)

    # Stage 1 (use_readability=True): best quality for standard pages.
    # Stage 2 (use_readability=False): less aggressive, does not filter by
    # CSS visibility.
    for use_readability in (True, False):
        ret = readabilipy.simple_json.simple_json_from_html_string(
            html, use_readability=use_readability
        )
        # "content" may be absent, None or empty; none of those must abort
        # the chain, so simply move on to the next stage.
        content_html = ret.get("content")
        if not content_html:
            continue
        content = markdownify.markdownify(
            content_html,
            heading_style=markdownify.ATX,
        )
        if len(content.strip()) >= min_expected_length:
            return content

    # Stage 3: Convert full HTML directly with markdownify (last resort).
    content = markdownify.markdownify(
        html,
        heading_style=markdownify.ATX,
    )
    if content.strip():
        return content

    return "<error>Page failed to be simplified from HTML</error>"


def get_robots_txt_url(url: str) -> str:
Expand Down
88 changes: 88 additions & 0 deletions src/fetch/tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,91 @@ async def test_fetch_with_proxy(self):

# Verify AsyncClient was called with proxy
mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")



class TestExtractContentFallback:
    """Exercise the staged fallback chain of extract_content_from_html."""

    # Dotted path of the readabilipy entry point replaced in every test.
    _EXTRACTOR = "readabilipy.simple_json.simple_json_from_html_string"

    def test_readability_sufficient_content_no_fallback(self):
        """A long enough Readability result is returned without any retry."""
        paragraphs = "<p>word </p>" * 200
        html = "<html><body>" + paragraphs + "</body></html>"

        with patch(
            self._EXTRACTOR,
            return_value={"content": "<div>" + paragraphs + "</div>"},
        ) as extractor:
            result = extract_content_from_html(html)

        # A single call means the Readability stage was accepted as-is.
        assert extractor.call_count == 1
        assert "word" in result

    def test_readability_strips_content_falls_back_to_no_readability(self):
        """A near-empty Readability result triggers the non-Readability stage."""
        # Large page whose real content Readability is pretended to drop.
        paragraphs = "<p>content </p>" * 500
        html = "<html><body>" + paragraphs + "</body></html>"

        def fake_extractor(markup, use_readability=True):
            if not use_readability:
                # The plain readabilipy pass keeps the whole body.
                return {"content": "<div>" + paragraphs + "</div>"}
            # The Readability pass leaves only a placeholder.
            return {"content": "<div>Loading...</div>"}

        with patch(self._EXTRACTOR, side_effect=fake_extractor):
            result = extract_content_from_html(html)

        assert "content" in result
        assert len(result.strip()) > 100

    def test_both_readability_modes_fail_falls_back_to_markdownify(self):
        """Empty output from both readabilipy modes falls through to markdownify."""
        html = "<html><body>" + "<p>important data </p>" * 300 + "</body></html>"

        with patch(self._EXTRACTOR, return_value={"content": ""}) as extractor:
            result = extract_content_from_html(html)

        # The text can only have come from converting the raw HTML.
        assert "important data" in result
        # One call per readabilipy mode.
        assert extractor.call_count == 2

    def test_completely_empty_html_returns_error(self):
        """An empty document yields the error marker."""
        with patch(self._EXTRACTOR, return_value={"content": ""}):
            result = extract_content_from_html("")

        assert "<error>" in result

    def test_readability_none_content_triggers_fallback(self):
        """A ``None`` content value from the first call triggers the fallback."""
        paragraphs = "<p>real content </p>" * 200
        html = "<html><body>" + paragraphs + "</body></html>"
        calls = 0

        def fake_extractor(markup, use_readability=True):
            nonlocal calls
            calls += 1
            if calls > 1:
                return {"content": "<div>" + paragraphs + "</div>"}
            # Only the very first call reports None content.
            return {"content": None}

        with patch(self._EXTRACTOR, side_effect=fake_extractor):
            result = extract_content_from_html(html)

        assert "real content" in result

    def test_threshold_is_one_percent_of_html(self):
        """The fallback threshold is 1% of the input HTML length."""
        # Roughly 9,000 characters of HTML, so the threshold is about 90.
        padding = "x" * 9000
        html = (
            '<html><body><div style="visibility:hidden">'
            f"{padding}</div><p>tiny</p></body></html>"
        )

        def fake_extractor(markup, use_readability=True):
            if not use_readability:
                return {"content": f"<div>{padding}</div><p>tiny</p>"}
            # Far below 1% of the input length.
            return {"content": "<p>tiny</p>"}

        with patch(self._EXTRACTOR, side_effect=fake_extractor):
            result = extract_content_from_html(html)

        # "tiny" alone (4 chars) is under the threshold, so a result this
        # long can only come from a fallback stage.
        assert len(result.strip()) > 50
Loading