Skip to content

Commit a2fcbbd

Browse files
committed
fix: export error.
1 parent 9d9cd22 commit a2fcbbd

2 files changed

Lines changed: 249 additions & 22 deletions

File tree

backend/app/routers/export.py

Lines changed: 114 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
1+
import base64
12
import io
3+
import mimetypes
24
import re
35
import urllib.parse
46
import zipfile
7+
from pathlib import Path
58

69
from fastapi import APIRouter, Depends, HTTPException, Query
710
from fastapi.responses import StreamingResponse, HTMLResponse
811

912
from app.auth import get_current_user, require_admin
13+
from app.config import settings
1014
from app.database import get_db
1115
from app.services.acl import resolve_page_permission
1216

@@ -18,6 +22,43 @@ def _sanitize_url(url: str) -> str:
1822
return "about:blank"
1923
return url
2024

25+
26+
# Matches `src="/api/media/<name>"` (optionally with a scheme+host prefix or
27+
# query/fragment). Only attribute-form srcs — URLs in user text aren't rewritten.
28+
_MEDIA_SRC_PATTERN = re.compile(
29+
r'(?P<prefix>src=")'
30+
r'(?:https?://[^/"]+)?'
31+
r'/api/media/'
32+
r'(?P<filename>[^"?#]+)'
33+
r'(?:[?#][^"]*)?"'
34+
)
35+
36+
37+
def _inline_media_srcs(html: str) -> str:
38+
"""Rewrite ``/api/media/<file>`` image srcs into ``data:`` URIs.
39+
40+
Without this, downloaded HTML opened via ``file://`` and print-dialog
41+
PDFs saved from the browser can't resolve the media URLs, so embedded
42+
images silently disappear from the export. Inlining keeps the artifact
43+
self-contained.
44+
"""
45+
media_dir = Path(settings.MEDIA_DIR).resolve()
46+
47+
def _replace(match: re.Match) -> str:
48+
filename = match.group("filename")
49+
try:
50+
filepath = (media_dir / filename).resolve()
51+
if not filepath.is_relative_to(media_dir) or not filepath.is_file():
52+
return match.group(0)
53+
data = filepath.read_bytes()
54+
except OSError:
55+
return match.group(0)
56+
mime = mimetypes.guess_type(str(filepath))[0] or "application/octet-stream"
57+
b64 = base64.b64encode(data).decode("ascii")
58+
return f'{match.group("prefix")}data:{mime};base64,{b64}"'
59+
60+
return _MEDIA_SRC_PATTERN.sub(_replace, html)
61+
2162
router = APIRouter(prefix="/api/export", tags=["export"])
2263

2364
HTML_TEMPLATE = """<!DOCTYPE html>
@@ -27,20 +68,26 @@ def _sanitize_url(url: str) -> str:
2768
<meta name="viewport" content="width=device-width, initial-scale=1.0">
2869
<title>{title}</title>
2970
<style>
30-
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 800px; margin: 2rem auto; padding: 0 1rem; color: #1e293b; line-height: 1.7; }}
31-
h1 {{ border-bottom: 2px solid #e2e8f0; padding-bottom: 0.3em; }}
32-
h2 {{ border-bottom: 1px solid #e2e8f0; padding-bottom: 0.3em; }}
33-
pre {{ background: #1e293b; color: #e2e8f0; padding: 1em; border-radius: 8px; overflow-x: auto; }}
34-
code {{ background: #f1f5f9; padding: 0.15em 0.4em; border-radius: 3px; font-size: 0.9em; }}
35-
pre code {{ background: none; padding: 0; color: inherit; }}
36-
blockquote {{ border-left: 4px solid #e2e8f0; padding-left: 1em; color: #64748b; margin: 0.5em 0; }}
71+
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'PingFang TC', 'Noto Sans CJK TC', sans-serif; max-width: 820px; margin: 2rem auto; padding: 0 1.25rem; color: #1e293b; line-height: 1.7; background: #ffffff; }}
72+
h1, h2, h3, h4, h5, h6 {{ margin: 1.4em 0 0.6em; font-weight: 600; line-height: 1.3; }}
73+
h1 {{ font-size: 2em; border-bottom: 2px solid #e2e8f0; padding-bottom: 0.3em; }}
74+
h2 {{ font-size: 1.5em; border-bottom: 1px solid #e2e8f0; padding-bottom: 0.3em; }}
75+
h3 {{ font-size: 1.25em; }}
76+
p {{ margin: 0.5em 0; }}
77+
ul, ol {{ padding-left: 1.6em; margin: 0.5em 0; }}
78+
li {{ margin: 0.25em 0; }}
79+
pre {{ background: #1e293b; color: #e2e8f0; padding: 1em; border-radius: 8px; overflow-x: auto; font-size: 0.9em; }}
80+
code {{ background: #f1f5f9; padding: 0.15em 0.4em; border-radius: 3px; font-size: 0.9em; font-family: 'SF Mono', Menlo, Consolas, monospace; }}
81+
pre code {{ background: none; padding: 0; color: inherit; font-size: 1em; }}
82+
blockquote {{ border-left: 4px solid #e2e8f0; padding-left: 1em; color: #64748b; margin: 0.75em 0; }}
3783
table {{ border-collapse: collapse; width: 100%; margin: 0.75em 0; }}
3884
th, td {{ border: 1px solid #e2e8f0; padding: 0.5em 0.75em; text-align: left; }}
3985
th {{ background: #f8fafc; font-weight: 600; }}
40-
img {{ max-width: 100%; }}
41-
a {{ color: #2563eb; }}
86+
img {{ max-width: 100%; border-radius: 6px; display: block; margin: 0.5em 0; }}
87+
a {{ color: #2563eb; text-decoration: underline; }}
88+
a.wikilink {{ color: #2563eb; text-decoration: none; border-bottom: 1px dashed #93c5fd; }}
4289
hr {{ border: none; border-top: 2px solid #e2e8f0; margin: 1.5em 0; }}
43-
.callout {{ border: 1px solid; border-radius: 8px; padding: 0.75em 1em; margin: 1em 0; }}
90+
.callout {{ border: 1px solid; border-left-width: 4px; border-radius: 6px; padding: 0.75em 1em; margin: 1em 0; }}
4491
.callout-info {{ border-color: #bfdbfe; background: #f0f7ff; }}
4592
.callout-warning {{ border-color: #fde68a; background: #fffdf5; }}
4693
.callout-tip {{ border-color: #a7f3d0; background: #f0fdf8; }}
@@ -55,6 +102,33 @@ def _sanitize_url(url: str) -> str:
55102
</body>
56103
</html>"""
57104

105+
# Extra CSS spliced in before ``</style>`` for the ``format=pdf`` export:
# styles the on-screen hint banner and, under ``@media print``, drops the page
# margins/width cap and hides the banner so it never appears in the saved PDF.
PDF_PRINT_CSS = """
.print-hint { background: #fef3c7; border: 1px solid #fde68a; border-radius: 8px; padding: 0.75em 1em; margin-bottom: 1.5em; color: #78350f; font-size: 0.9em; }
.print-hint kbd { background: #fffbeb; border: 1px solid #fcd34d; border-radius: 4px; padding: 0 0.35em; font-family: inherit; font-size: 0.85em; }
@media print {
body { margin: 0; max-width: 100%; }
@page { margin: 1.5cm; }
.print-hint { display: none; }
}
"""

# Banner inserted right after ``<body>`` for the ``format=pdf`` export. The
# (Traditional Chinese) text tells the reader that the print dialog was opened
# automatically, gives the Ctrl+P / Cmd+P fallback, and says to pick
# "Save as PDF" as the printer.
PDF_HINT_BANNER = (
    '<div class="print-hint">'
    '已自動開啟列印對話框 — 若未彈出,請按 '
    '<kbd>Ctrl</kbd>+<kbd>P</kbd>(Windows)或 '
    '<kbd>⌘</kbd>+<kbd>P</kbd>(Mac),'
    '在印表機選擇「另存為 PDF / Save as PDF」即可下載。'
    '</div>'
)

# Script inserted right before ``</body>`` for the ``format=pdf`` export: once
# the window's ``load`` event fires it waits 150 ms and calls
# ``window.print()``.
PDF_AUTO_PRINT_SCRIPT = (
    "<script>"
    "window.addEventListener('load', function () { "
    "setTimeout(function () { window.print(); }, 150); "
    "});"
    "</script>"
)
131+
58132
SITE_INDEX_TEMPLATE = """<!DOCTYPE html>
59133
<html lang="zh-TW">
60134
<head>
@@ -102,6 +176,20 @@ def save_block(m):
102176
for i, b in enumerate(blocks):
103177
html = html.replace(f"%%BLOCK_{i}%%", b)
104178

179+
# Inline code — stash before other transforms so content can't be mangled
180+
# (e.g. `**x**` inside code must stay literal). Double-backtick form first so
181+
# an escaped backtick inside ``…`` doesn't desync single-backtick pairing —
182+
# otherwise every `code` span later in the document gets flipped (</code>
183+
# where <code> was expected), which is how the 部署方式 section broke.
184+
inline_codes = []
185+
def save_inline(content):
186+
idx = len(inline_codes)
187+
inline_codes.append(content)
188+
return f"%%INLINE_{idx}%%"
189+
190+
html = re.sub(r"``\s*(.+?)\s*``", lambda m: save_inline(m.group(1)), html)
191+
html = re.sub(r"`([^`\n]+)`", lambda m: save_inline(m.group(1)), html)
192+
105193
# Headers
106194
for i in range(6, 0, -1):
107195
html = re.sub(rf"^{'#' * i}\s+(.+)$", rf"<h{i}>\1</h{i}>", html, flags=re.MULTILINE)
@@ -114,9 +202,6 @@ def save_block(m):
114202
html = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", html)
115203
html = re.sub(r"\*(.+?)\*", r"<em>\1</em>", html)
116204

117-
# Inline code
118-
html = re.sub(r"`([^`]+)`", r"<code>\1</code>", html)
119-
120205
# Wikilinks
121206
html = re.sub(r"\[\[([^\]|]+?)\|([^\]]+?)\]\]", r'<a href="\1.html">\2</a>', html)
122207
html = re.sub(r"\[\[([^\]|]+?)\]\]", r'<a href="\1.html">\1</a>', html)
@@ -173,6 +258,10 @@ def parse_table(m):
173258
# Paragraphs
174259
html = re.sub(r"^(?!<[a-z/])((?!\s*$).+)$", r"<p>\1</p>", html, flags=re.MULTILINE)
175260

261+
# Restore stashed inline code
262+
for i, content in enumerate(inline_codes):
263+
html = html.replace(f"%%INLINE_{i}%%", f"<code>{content}</code>")
264+
176265
return html
177266

178267

@@ -193,21 +282,24 @@ async def export_page(
193282
page = dict(rows[0])
194283
if await resolve_page_permission(db, user, page["id"]) == "none":
195284
raise HTTPException(status_code=404, detail="Page not found")
196-
html_content = md_to_simple_html(page["content_md"])
285+
html_content = _inline_media_srcs(md_to_simple_html(page["content_md"]))
197286
full_html = HTML_TEMPLATE.format(
198287
title=page["title"],
199288
slug=page["slug"],
200289
content=html_content,
201290
)
202291

203292
if format == "pdf":
204-
# Return HTML with print-friendly styling — browser can use Ctrl+P
205-
pdf_html = full_html.replace("</style>", """
206-
@media print {
207-
body { margin: 0; max-width: 100%; }
208-
@page { margin: 1.5cm; }
209-
}
210-
</style>""")
293+
# Browser-print approach: inject print CSS + user hint banner +
294+
# auto-trigger window.print() on load. The user picks "Save as PDF"
295+
# from the print dialog. Response stays HTML (filename .html) — we
296+
# don't claim to return a real PDF.
297+
pdf_html = (
298+
full_html
299+
.replace("</style>", PDF_PRINT_CSS + "</style>", 1)
300+
.replace("<body>", "<body>" + PDF_HINT_BANNER, 1)
301+
.replace("</body>", PDF_AUTO_PRINT_SCRIPT + "</body>", 1)
302+
)
211303
return HTMLResponse(content=pdf_html, headers={
212304
"Content-Disposition": f'inline; filename="{slug}.html"',
213305
})
@@ -238,7 +330,7 @@ async def export_site(
238330
page_links = []
239331
for p in pages:
240332
page = dict(p)
241-
html_content = md_to_simple_html(page["content_md"])
333+
html_content = _inline_media_srcs(md_to_simple_html(page["content_md"]))
242334
full_html = HTML_TEMPLATE.format(
243335
title=page["title"],
244336
slug=page["slug"],

backend/tests/test_export.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
1+
import base64
2+
from pathlib import Path
3+
14
import pytest
25

6+
from app.config import settings
7+
38
@pytest.mark.asyncio
49
async def test_export_page(auth_client):
510
# Create page
@@ -14,6 +19,136 @@ async def test_export_page(auth_client):
1419
assert response.headers["Content-Type"] == "text/html; charset=utf-8"
1520
assert b"Export content" in response.content
1621

22+
@pytest.mark.asyncio
async def test_export_page_pdf_auto_prints(auth_client):
    """The pdf format is served as inline HTML that triggers the print dialog."""
    new_page = {
        "title": "PDF Page",
        "content_md": "PDF content body",
        "slug": "pdf-page",
    }
    await auth_client.post("/api/pages", json=new_page)

    response = await auth_client.get("/api/export/page/pdf-page?format=pdf")
    assert response.status_code == 200
    assert response.headers["Content-Type"] == "text/html; charset=utf-8"
    # Must be "inline" so the browser renders it; "attachment" would just
    # download a useless .html pretending to be a PDF.
    assert "inline" in response.headers["content-disposition"]

    body = response.text
    expected_fragments = (
        "PDF content body",
        "window.print()",
        "print-hint",
        # The print stylesheet hides the hint banner in the rendered PDF.
        "@media print",
        ".print-hint { display: none; }",
    )
    for fragment in expected_fragments:
        assert fragment in body
45+
46+
47+
@pytest.mark.asyncio
async def test_export_page_html_has_no_auto_print(auth_client):
    """The html format is a plain download and carries no print machinery."""
    new_page = {
        "title": "Html Only",
        "content_md": "Plain html export",
        "slug": "html-only",
    }
    await auth_client.post("/api/pages", json=new_page)

    response = await auth_client.get("/api/export/page/html-only?format=html")
    assert response.status_code == 200
    assert "attachment" in response.headers["content-disposition"]

    # Neither the auto-print script nor the hint banner may be present.
    for pdf_only_marker in ("window.print()", "print-hint"):
        assert pdf_only_marker not in response.text
61+
62+
63+
@pytest.mark.asyncio
async def test_export_page_pdf_content_not_double_injected(auth_client):
    """Page content containing a literal '<body>' must not hijack injection.

    md_to_simple_html is expected to escape `<` and `>`, so user content
    cannot smuggle real `<body>` / `</body>` tags into the template. This test
    guards that assumption so a regression in the markdown escaping is caught
    here.
    """
    await auth_client.post("/api/pages", json={
        "title": "Tricky",
        "content_md": "Body tag test: <body> </style> </body>",
        "slug": "tricky-tags",
    })

    response = await auth_client.get("/api/export/page/tricky-tags?format=pdf")
    assert response.status_code == 200
    body = response.text
    # Exactly one auto-print script and one hint banner.
    assert body.count("window.print()") == 1
    assert body.count('class="print-hint"') == 1

    # The export injects with str.replace(..., 1), which always fires exactly
    # once, so the counts above stay at 1 even if escaping breaks. What would
    # change is the number of raw tags and where the injected pieces land.
    # Assumes the template itself contains a single <body> element.
    assert body.count("<body>") == 1
    assert body.count("</body>") == 1
    # Banner directly after the real <body>, script directly before the real
    # </body> — i.e. neither was attached to a tag coming from page content.
    assert '<body><div class="print-hint">' in body
    assert "</script></body>" in body
83+
84+
85+
@pytest.mark.asyncio
async def test_export_page_inlines_media_images(auth_client):
    """Images served from /api/media must be embedded as data URIs so the
    exported file stays self-contained when opened via file:// or saved as PDF.
    """
    # 1x1 PNG, placed directly in MEDIA_DIR to bypass the auth-gated upload
    # endpoint (which would require bouncing through a multipart request).
    png_bytes = base64.b64decode(
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII="
    )
    filename = "export-test-pixel.png"
    media_dir = Path(settings.MEDIA_DIR)
    # A fresh checkout / CI workspace may not have the media directory yet;
    # without this the write below fails before the export is ever exercised.
    media_dir.mkdir(parents=True, exist_ok=True)
    filepath = media_dir / filename
    filepath.write_bytes(png_bytes)

    try:
        await auth_client.post("/api/pages", json={
            "title": "With Image",
            "content_md": f"Has image: ![pixel](/api/media/{filename})",
            "slug": "with-image",
        })

        response = await auth_client.get("/api/export/page/with-image")
        assert response.status_code == 200
        body = response.text
        expected = "data:image/png;base64," + base64.b64encode(png_bytes).decode()
        assert expected in body
        # Raw /api/media path must not leak through in the exported src.
        assert f'src="/api/media/{filename}"' not in body

        # PDF export goes through the same inlining path.
        pdf_response = await auth_client.get(
            "/api/export/page/with-image?format=pdf"
        )
        # Check the status first so a failing request is reported as such
        # rather than as a missing data URI in an error body.
        assert pdf_response.status_code == 200
        assert expected in pdf_response.text
    finally:
        filepath.unlink(missing_ok=True)
121+
122+
123+
@pytest.mark.asyncio
async def test_export_double_backtick_doesnt_desync_inline_code(auth_client):
    """A double-backtick span wrapping a literal backtick must not invert the
    <code>/</code> tags of inline code that appears later in the document.

    Regression guard: the earlier single-backtick regex paired backticks
    strictly left to right, so a double-backtick escape left stray backticks
    behind and every following span came out flipped — `.env` was rendered
    as `</code>.env`.
    """
    markdown = (
        "Escape demo: `` `inner` ``\n\n"
        "Later inline: `./data` and `.env`\n"
    )
    await auth_client.post("/api/pages", json={
        "title": "Backtick Escape",
        "content_md": markdown,
        "slug": "backtick-escape",
    })

    response = await auth_client.get("/api/export/page/backtick-escape")
    assert response.status_code == 200
    body = response.text

    # Spans after the escape must come out with correctly ordered tags.
    for well_formed in ("<code>./data</code>", "<code>.env</code>"):
        assert well_formed in body
    # ...and no inverted tag may precede their content.
    for flipped in ("</code>./data", "</code>.env"):
        assert flipped not in body
150+
151+
17152
@pytest.mark.asyncio
18153
async def test_export_site_requires_admin(auth_client):
19154
# Non-admins should be blocked — site exports would otherwise silently

0 commit comments

Comments
 (0)