Skip to content

Commit 476dff3

Browse files
committed
fix(fetch): fall back without readability js
1 parent b1e1eb1 commit 476dff3

4 files changed

Lines changed: 125 additions & 8 deletions

File tree

src/fetch/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ The fetch tool will truncate the response, but by using the `start_index` argume
2626

2727
## Installation
2828

29-
Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust.
29+
Optionally, install Node.js; the fetch server will then use a different, more robust HTML simplifier. If Node.js is not available, the server falls back to readabilipy's Python-only HTML simplifier.
3030

3131
### Using uv (recommended)
3232

@@ -170,6 +170,10 @@ This can be customized by adding the argument `--user-agent=YourUserAgent` to th
170170

171171
The server can be configured to use a proxy by using the `--proxy-url` argument.
172172

173+
### Customization - HTML simplification
174+
175+
By default, the server uses readabilipy's optional Node.js simplifier when Node.js is available, and otherwise falls back to the Python-only simplifier. To force the Python-only path even when Node.js is installed, add the `--no-readability-js` argument.
176+
173177
## Windows Configuration
174178

175179
If you're experiencing timeout issues on Windows, you may need to set the `PYTHONIOENCODING` environment variable to ensure proper character encoding:

src/fetch/src/mcp_server_fetch/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,21 @@ def main():
1616
help="Ignore robots.txt restrictions",
1717
)
1818
parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests")
19+
parser.add_argument(
20+
"--no-readability-js",
21+
action="store_true",
22+
help="Use readabilipy's Python-only HTML simplifier even when Node.js is installed",
23+
)
1924

2025
args = parser.parse_args()
21-
asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url))
26+
asyncio.run(
27+
serve(
28+
custom_user_agent=args.user_agent,
29+
ignore_robots_txt=args.ignore_robots_txt,
30+
proxy_url=args.proxy_url,
31+
use_readability_js=not args.no_readability_js,
32+
)
33+
)
2234

2335

2436
if __name__ == "__main__":

src/fetch/src/mcp_server_fetch/server.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import shutil
12
from typing import Annotated, Tuple
23
from urllib.parse import urlparse, urlunparse
34

@@ -24,7 +25,11 @@
2425
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
2526

2627

27-
def extract_content_from_html(html: str) -> str:
28+
def _can_use_readability_js(use_readability_js: bool) -> bool:
29+
return use_readability_js and shutil.which("node") is not None
30+
31+
32+
def extract_content_from_html(html: str, use_readability_js: bool = True) -> str:
2833
"""Extract and convert HTML content to Markdown format.
2934
3035
Args:
@@ -34,7 +39,7 @@ def extract_content_from_html(html: str) -> str:
3439
Simplified markdown version of the content
3540
"""
3641
ret = readabilipy.simple_json.simple_json_from_html_string(
37-
html, use_readability=True
42+
html, use_readability=_can_use_readability_js(use_readability_js)
3843
)
3944
if not ret["content"]:
4045
return "<error>Page failed to be simplified from HTML</error>"
@@ -109,7 +114,11 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
109114

110115

111116
async def fetch_url(
112-
url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
117+
url: str,
118+
user_agent: str,
119+
force_raw: bool = False,
120+
proxy_url: str | None = None,
121+
use_readability_js: bool = True,
113122
) -> Tuple[str, str]:
114123
"""
115124
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
@@ -140,7 +149,9 @@ async def fetch_url(
140149
)
141150

142151
if is_page_html and not force_raw:
143-
return extract_content_from_html(page_raw), ""
152+
return extract_content_from_html(
153+
page_raw, use_readability_js=use_readability_js
154+
), ""
144155

145156
return (
146157
page_raw,
@@ -182,13 +193,15 @@ async def serve(
182193
custom_user_agent: str | None = None,
183194
ignore_robots_txt: bool = False,
184195
proxy_url: str | None = None,
196+
use_readability_js: bool = True,
185197
) -> None:
186198
"""Run the fetch MCP server.
187199
188200
Args:
189201
custom_user_agent: Optional custom User-Agent string to use for requests
190202
ignore_robots_txt: Whether to ignore robots.txt restrictions
191203
proxy_url: Optional proxy URL to use for requests
204+
use_readability_js: Whether to use readabilipy's optional Node.js simplifier
192205
"""
193206
server = Server("mcp-fetch")
194207
user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
@@ -235,7 +248,11 @@ async def call_tool(name, arguments: dict) -> list[TextContent]:
235248
await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)
236249

237250
content, prefix = await fetch_url(
238-
url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
251+
url,
252+
user_agent_autonomous,
253+
force_raw=args.raw,
254+
proxy_url=proxy_url,
255+
use_readability_js=use_readability_js,
239256
)
240257
original_length = len(content)
241258
if args.start_index >= original_length:
@@ -262,7 +279,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
262279
url = arguments["url"]
263280

264281
try:
265-
content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url)
282+
content, prefix = await fetch_url(
283+
url,
284+
user_agent_manual,
285+
proxy_url=proxy_url,
286+
use_readability_js=use_readability_js,
287+
)
266288
# TODO: after SDK bug is addressed, don't catch the exception
267289
except McpError as e:
268290
return GetPromptResult(

src/fetch/tests/test_server.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,54 @@ def test_empty_content_returns_error(self):
8787
result = extract_content_from_html(html)
8888
assert "<error>" in result
8989

90+
def test_uses_readability_js_when_node_is_available(self):
91+
html = "<html><body><article><p>Hello</p></article></body></html>"
92+
93+
with (
94+
patch("mcp_server_fetch.server.shutil.which", return_value="node"),
95+
patch(
96+
"mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
97+
) as mock_simple_json,
98+
):
99+
mock_simple_json.return_value = {"content": "<p>Hello</p>"}
100+
101+
result = extract_content_from_html(html)
102+
103+
mock_simple_json.assert_called_once_with(html, use_readability=True)
104+
assert "Hello" in result
105+
106+
def test_falls_back_without_node(self):
107+
html = "<html><body><article><p>Hello</p></article></body></html>"
108+
109+
with (
110+
patch("mcp_server_fetch.server.shutil.which", return_value=None),
111+
patch(
112+
"mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
113+
) as mock_simple_json,
114+
):
115+
mock_simple_json.return_value = {"content": "<p>Hello</p>"}
116+
117+
result = extract_content_from_html(html)
118+
119+
mock_simple_json.assert_called_once_with(html, use_readability=False)
120+
assert "Hello" in result
121+
122+
def test_can_disable_readability_js(self):
123+
html = "<html><body><article><p>Hello</p></article></body></html>"
124+
125+
with (
126+
patch("mcp_server_fetch.server.shutil.which", return_value="node"),
127+
patch(
128+
"mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
129+
) as mock_simple_json,
130+
):
131+
mock_simple_json.return_value = {"content": "<p>Hello</p>"}
132+
133+
result = extract_content_from_html(html, use_readability_js=False)
134+
135+
mock_simple_json.assert_called_once_with(html, use_readability=False)
136+
assert "Hello" in result
137+
90138

91139
class TestCheckMayAutonomouslyFetchUrl:
92140
"""Tests for check_may_autonomously_fetch_url function."""
@@ -219,6 +267,37 @@ async def test_fetch_html_page(self):
219267
assert isinstance(content, str)
220268
assert prefix == ""
221269

270+
@pytest.mark.asyncio
271+
async def test_fetch_html_forwards_readability_js_option(self):
272+
"""Test that fetch_url forwards the readability JS option."""
273+
html_content = "<html><body><h1>Test</h1></body></html>"
274+
mock_response = MagicMock()
275+
mock_response.status_code = 200
276+
mock_response.text = html_content
277+
mock_response.headers = {"content-type": "text/html"}
278+
279+
with (
280+
patch("httpx.AsyncClient") as mock_client_class,
281+
patch(
282+
"mcp_server_fetch.server.extract_content_from_html",
283+
return_value="Test",
284+
) as mock_extract,
285+
):
286+
mock_client = AsyncMock()
287+
mock_client.get = AsyncMock(return_value=mock_response)
288+
mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
289+
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
290+
291+
content, prefix = await fetch_url(
292+
"https://example.com/page",
293+
DEFAULT_USER_AGENT_AUTONOMOUS,
294+
use_readability_js=False,
295+
)
296+
297+
mock_extract.assert_called_once_with(html_content, use_readability_js=False)
298+
assert content == "Test"
299+
assert prefix == ""
300+
222301
@pytest.mark.asyncio
223302
async def test_fetch_html_page_raw(self):
224303
"""Test fetching an HTML page with raw=True returns original HTML."""

0 commit comments

Comments
 (0)