Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/fetch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ The fetch tool will truncate the response, but by using the `start_index` argume
- Arguments:
- `url` (string, required): URL to fetch

### Content negotiation

The fetch tool sends an `Accept: text/markdown, text/html;q=0.9, */*;q=0.8` request header, asking servers for native markdown when they support it. If the server responds with `Content-Type: text/markdown` (with or without a charset parameter), the response body is returned as-is and the HTML-to-markdown extraction step is skipped. Otherwise the existing readability + markdownify pipeline runs as before.

This benefits sites that serve markdown directly via content negotiation — for example, Cloudflare-hosted sites with the [Markdown for Agents](https://developers.cloudflare.com/fundamentals/reference/markdown-for-agents/) feature enabled (the linked documentation page is itself such a site), content-negotiating CMSes, and raw-content endpoints. Servers that don't recognise the Accept header simply respond with whatever they normally would, so the change is fully backwards-compatible.

## Installation

Optionally, install Node.js; this will cause the fetch server to use a different HTML simplifier that is more robust.
Expand Down
10 changes: 9 additions & 1 deletion src/fetch/src/mcp_server_fetch/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,10 @@ async def fetch_url(
response = await client.get(
url,
follow_redirects=True,
headers={"User-Agent": user_agent},
headers={
"User-Agent": user_agent,
"Accept": "text/markdown, text/html;q=0.9, */*;q=0.8",
},
timeout=30,
)
except HTTPError as e:
Expand All @@ -135,6 +138,11 @@ async def fetch_url(
page_raw = response.text

content_type = response.headers.get("content-type", "")

# Server provided markdown directly via content negotiation; skip HTML extraction.
if content_type.split(";", 1)[0].strip().lower() == "text/markdown":
return page_raw, ""

is_page_html = (
"<html" in page_raw[:100] or "text/html" in content_type or not content_type
)
Expand Down
99 changes: 99 additions & 0 deletions src/fetch/tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,102 @@ async def test_fetch_with_proxy(self):

# Verify AsyncClient was called with proxy
mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")

@pytest.mark.asyncio
async def test_fetch_markdown_returns_early(self):
    """A ``text/markdown`` response is handed back verbatim with an empty prefix.

    The mocked server answers with a bare ``text/markdown`` content type, so
    ``fetch_url`` is expected to return the body unchanged instead of running
    the HTML extraction step.
    """
    expected_body = "# Hello World\n\nThis is markdown."
    # Fake HTTP response: only the attributes set here are configured.
    fake_response = MagicMock(
        status_code=200,
        text=expected_body,
        headers={"content-type": "text/markdown"},
    )

    with patch("httpx.AsyncClient") as client_cls:
        fake_client = AsyncMock()
        fake_client.get = AsyncMock(return_value=fake_response)
        # Make ``async with AsyncClient(...) as client`` yield the fake client.
        client_ctx = client_cls.return_value
        client_ctx.__aenter__ = AsyncMock(return_value=fake_client)
        client_ctx.__aexit__ = AsyncMock(return_value=None)

        content, prefix = await fetch_url(
            "https://example.com/readme.md",
            DEFAULT_USER_AGENT_AUTONOMOUS,
        )

        assert content == expected_body
        assert prefix == ""

@pytest.mark.asyncio
async def test_fetch_markdown_with_charset(self):
    """``text/markdown`` followed by a charset parameter still counts as markdown.

    The content type carries ``; charset=utf-8``; the body must nevertheless
    come back unchanged and with an empty prefix.
    """
    expected_body = "# Title\n\nBody."
    fake_response = MagicMock(
        status_code=200,
        text=expected_body,
        headers={"content-type": "text/markdown; charset=utf-8"},
    )

    with patch("httpx.AsyncClient") as client_cls:
        fake_client = AsyncMock()
        fake_client.get = AsyncMock(return_value=fake_response)
        # Make ``async with AsyncClient(...) as client`` yield the fake client.
        client_ctx = client_cls.return_value
        client_ctx.__aenter__ = AsyncMock(return_value=fake_client)
        client_ctx.__aexit__ = AsyncMock(return_value=None)

        content, prefix = await fetch_url(
            "https://example.com/page.md",
            DEFAULT_USER_AGENT_AUTONOMOUS,
        )

        assert content == expected_body
        assert prefix == ""

@pytest.mark.asyncio
async def test_fetch_x_markdown_does_not_match(self):
    """The non-standard ``text/x-markdown`` type must not take the markdown shortcut."""
    raw_body = "some non-standard markdown variant"
    fake_response = MagicMock(
        status_code=200,
        text=raw_body,
        headers={"content-type": "text/x-markdown"},
    )

    with patch("httpx.AsyncClient") as client_cls:
        fake_client = AsyncMock()
        fake_client.get = AsyncMock(return_value=fake_response)
        # Make ``async with AsyncClient(...) as client`` yield the fake client.
        client_ctx = client_cls.return_value
        client_ctx.__aenter__ = AsyncMock(return_value=fake_client)
        client_ctx.__aexit__ = AsyncMock(return_value=None)

        content, prefix = await fetch_url(
            "https://example.com/page",
            DEFAULT_USER_AGENT_AUTONOMOUS,
        )

        # Contract being pinned: only the standard text/markdown media type
        # triggers the native-markdown early return. text/x-markdown matches
        # neither text/markdown nor text/html, so the body is expected back
        # raw, accompanied by the "cannot be simplified" prefix.
        assert content == raw_body
        assert "cannot be simplified" in prefix

@pytest.mark.asyncio
async def test_fetch_sends_accept_header(self):
    """``fetch_url`` must send an ``Accept`` header that lists markdown before HTML."""
    fake_response = MagicMock(
        status_code=200,
        text="# md",
        headers={"content-type": "text/markdown"},
    )

    with patch("httpx.AsyncClient") as client_cls:
        fake_client = AsyncMock()
        fake_client.get = AsyncMock(return_value=fake_response)
        # Make ``async with AsyncClient(...) as client`` yield the fake client.
        client_ctx = client_cls.return_value
        client_ctx.__aenter__ = AsyncMock(return_value=fake_client)
        client_ctx.__aexit__ = AsyncMock(return_value=None)

        await fetch_url(
            "https://example.com/page",
            DEFAULT_USER_AGENT_AUTONOMOUS,
        )

        # Inspect the keyword arguments of the recorded ``client.get`` call.
        sent_headers = fake_client.get.call_args.kwargs["headers"]
        assert "Accept" in sent_headers
        accept_value = sent_headers["Accept"]
        assert "text/markdown" in accept_value
        # Preference is checked positionally: the markdown entry has to
        # appear earlier in the header value than the HTML entry.
        markdown_pos = accept_value.index("text/markdown")
        html_pos = accept_value.index("text/html")
        assert markdown_pos < html_pos
Loading