Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/fetch/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ classifiers = [
"Programming Language :: Python :: 3.10",
]
dependencies = [
"charset-normalizer>=3.0.0",
"httpx>=0.27",
"markdownify>=0.13.1",
"mcp>=1.1.3",
Expand Down
28 changes: 19 additions & 9 deletions src/fetch/src/mcp_server_fetch/server.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typing import Annotated, Tuple
from urllib.parse import urlparse, urlunparse

import httpx
import markdownify
import readabilipy.simple_json
from charset_normalizer import from_bytes
from mcp.shared.exceptions import McpError
from mcp.server import Server
from mcp.server.stdio import stdio_server
Expand All @@ -24,6 +26,18 @@
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"


def get_response_text(response: httpx.Response) -> str:
    """Decode an httpx response body to text.

    When the HTTP Content-Type header declares a charset, httpx's own
    decoding is trusted as-is.  When no charset is declared, the raw bytes
    are run through charset_normalizer; its best candidate wins, and
    httpx's default decoding remains the last-resort fallback.
    """
    # A declared charset means the server told us how to decode — honor it.
    if response.charset_encoding is None:
        best_guess = from_bytes(response.content).best()
        if best_guess is not None:
            return str(best_guess)
    # Charset was declared, or detection produced no candidate: fall back
    # to httpx's standard decoding behavior.
    return response.text


def extract_content_from_html(html: str) -> str:
"""Extract and convert HTML content to Markdown format.

Expand Down Expand Up @@ -68,18 +82,16 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
Check if the URL can be fetched by the user agent according to the robots.txt file.
Raises a McpError if not.
"""
from httpx import AsyncClient, HTTPError

robot_txt_url = get_robots_txt_url(url)

async with AsyncClient(proxy=proxy_url) as client:
async with httpx.AsyncClient(proxy=proxy_url) as client:
try:
response = await client.get(
robot_txt_url,
follow_redirects=True,
headers={"User-Agent": user_agent},
)
except HTTPError:
except httpx.HTTPError:
raise McpError(ErrorData(
code=INTERNAL_ERROR,
message=f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
Expand Down Expand Up @@ -114,25 +126,23 @@ async def fetch_url(
"""
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
"""
from httpx import AsyncClient, HTTPError

async with AsyncClient(proxy=proxy_url) as client:
async with httpx.AsyncClient(proxy=proxy_url) as client:
try:
response = await client.get(
url,
follow_redirects=True,
headers={"User-Agent": user_agent},
timeout=30,
)
except HTTPError as e:
except httpx.HTTPError as e:
raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}"))
if response.status_code >= 400:
raise McpError(ErrorData(
code=INTERNAL_ERROR,
message=f"Failed to fetch {url} - status code {response.status_code}",
))

page_raw = response.text
page_raw = get_response_text(response)

content_type = response.headers.get("content-type", "")
is_page_html = (
Expand Down
52 changes: 52 additions & 0 deletions src/fetch/tests/test_server.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Tests for the fetch MCP server."""

import httpx
import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from mcp.shared.exceptions import McpError

from mcp_server_fetch.server import (
extract_content_from_html,
get_response_text,
get_robots_txt_url,
check_may_autonomously_fetch_url,
fetch_url,
Expand Down Expand Up @@ -324,3 +326,53 @@ async def test_fetch_with_proxy(self):

# Verify AsyncClient was called with proxy
mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")


def _build_response(raw_body: bytes, content_type: str = "text/html") -> httpx.Response:
    """Construct a real httpx.Response whose Content-Type header carries no
    charset parameter, so decoding must fall back to byte-level detection."""
    response_headers = {"content-type": content_type}
    return httpx.Response(
        200,
        headers=response_headers,
        content=raw_body,
    )


class TestGetResponseText:
    """Tests for get_response_text with various non-UTF-8 encodings.

    Each non-UTF-8 case builds a response whose Content-Type header omits
    the charset parameter, forcing get_response_text to rely on
    charset_normalizer's byte-level detection instead of the HTTP header.
    """

    def test_utf8_passthrough(self):
        """UTF-8 pages with charset declared in HTTP header use the standard path."""
        text = "Hello World"
        resp = httpx.Response(
            200,
            headers={"content-type": "text/html; charset=utf-8"},
            content=text.encode("utf-8"),
        )
        assert get_response_text(resp) == text

    def test_ukrainian_windows_1251(self):
        text = "Київ це найбільше місто України із населенням понад три мільйони людей та є столицею нашої держави і культурним центром країни."
        body = (
            b"<html><body><p>" + text.encode("windows-1251") + b"</p></body></html>"
        )
        assert text in get_response_text(_build_response(body))

    def test_hebrew_windows_1255(self):
        text = "ירושלים היא הבירה של ישראל ועיר קדושה לשלוש הדתות המונותאיסטיות הגדולות העיר שוכנת בהרי יהודה ומהווה מרכז דתי ותרבותי חשוב."
        body = (
            b"<html><body><p>" + text.encode("windows-1255") + b"</p></body></html>"
        )
        assert text in get_response_text(_build_response(body))

    def test_arabic_windows_1256(self):
        text = "القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها تقع على ضفاف نهر النيل وتعتبر من أكبر المدن في الشرق الأوسط وأفريقيا."
        body = (
            b"<html><body><p>" + text.encode("windows-1256") + b"</p></body></html>"
        )
        assert text in get_response_text(_build_response(body))

    def test_korean_euc_kr(self):
        # Bug fix: the body was previously encoded as UTF-8 (the httpx
        # default), so this test never exercised EUC-KR detection at all.
        # Encode as EUC-KR and omit the charset header, matching the other
        # non-UTF-8 cases above.
        text = "서울특별시는 대한민국의 수도이자 최대 도시이다"
        body = (
            b"<html><body><p>" + text.encode("euc-kr") + b"</p></body></html>"
        )
        assert text in get_response_text(_build_response(body))
4 changes: 3 additions & 1 deletion src/fetch/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading