diff --git a/src/fetch/README.md b/src/fetch/README.md index 2c3e048927..add58c2c11 100644 --- a/src/fetch/README.md +++ b/src/fetch/README.md @@ -24,6 +24,12 @@ The fetch tool will truncate the response, but by using the `start_index` argume - Arguments: - `url` (string, required): URL to fetch +### Content negotiation + +The fetch tool sends an `Accept: text/markdown, text/html;q=0.9, */*;q=0.8` request header, asking servers for native markdown when they support it. If the server responds with `Content-Type: text/markdown` (with or without a charset parameter), the response body is returned as-is and the HTML-to-markdown extraction step is skipped. Otherwise the existing readability + markdownify pipeline runs as before. + +This benefits sites that serve markdown directly via content negotiation — for example, Cloudflare-hosted sites with the [Markdown for Agents](https://developers.cloudflare.com/fundamentals/reference/markdown-for-agents/) feature enabled (the linked documentation page is itself such a site), content-negotiating CMSes, and raw-content endpoints. Servers that don't offer a `text/markdown` representation simply ignore that preference and respond with whatever they normally would (the `text/html` and `*/*` fallbacks keep every other content type acceptable), so the change is fully backwards-compatible. + ## Installation Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust. 
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index b42c7b1f6b..9dac6fff48 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -121,7 +121,10 @@ async def fetch_url( response = await client.get( url, follow_redirects=True, - headers={"User-Agent": user_agent}, + headers={ + "User-Agent": user_agent, + "Accept": "text/markdown, text/html;q=0.9, */*;q=0.8", + }, timeout=30, ) except HTTPError as e: @@ -135,6 +138,11 @@ async def fetch_url( page_raw = response.text content_type = response.headers.get("content-type", "") + + # Server provided markdown directly via content negotiation; skip HTML extraction. + if content_type.split(";", 1)[0].strip().lower() == "text/markdown": + return page_raw, "" + is_page_html = ( "