Skip to content

Commit a9fc3bb

Browse files
committed
fix(fetch): strip script/style tags before markdownify in Stage 3 fallback
1 parent a5ae267 commit a9fc3bb

1 file changed

Lines changed: 8 additions & 2 deletions

File tree

src/fetch/src/mcp_server_fetch/server.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33

44
import markdownify
55
import readabilipy.simple_json
6+
from bs4 import BeautifulSoup
67
from mcp.shared.exceptions import McpError
78
from mcp.server import Server
89
from mcp.server.stdio import stdio_server
@@ -71,9 +72,14 @@ def extract_content_from_html(html: str) -> str:
7172
if len(content.strip()) >= min_expected_length:
7273
return content
7374

74-
# Stage 3: Convert full HTML directly with markdownify (last resort)
75+
# Stage 3: Convert full HTML directly with markdownify (last resort).
76+
# Strip <script> and <style> first — markdownify renders them verbatim as
77+
# plain text, which injects large blobs of JS/CSS noise into the output.
78+
soup = BeautifulSoup(html, "html.parser")
79+
for tag in soup(["script", "style"]):
80+
tag.decompose()
7581
content = markdownify.markdownify(
76-
html,
82+
str(soup),
7783
heading_style=markdownify.ATX,
7884
)
7985
if content.strip():

0 commit comments

Comments
 (0)