Skip to content

Commit 9fa4aab

Browse files
cdeust and claude committed
feat(wiki redesign): Phase 10 — Pandoc export (PDF, LaTeX, DOCX, HTML)
Wiki pages now export through Pandoc with full bibliography + cross-ref + math support. Scientists can write in the wiki, hit PDF, and get a journal-submittable document.

Backend (handlers/wiki_export.py):
- Four output formats whitelisted: pdf, tex, docx, html.
- Requires `pandoc` on PATH; if missing, returns an actionable install message (brew/apt instructions + texlive for PDF).
- Reads source via wiki_store.read_page (CodeQL commonpath sanitizer) or an inline body argument for ad-hoc exports.
- Resolves bibliography files from frontmatter `bibliography: [_bibliography/...]`, page-level override, or fallback to all bib files under _bibliography/. Paths locked to that directory.
- Extra pandoc_args whitelisted: --toc, --number-sections, --standalone, --section-divs, --citeproc, --biblatex, …. Any flag not on the list is dropped silently — subprocess injection vector closed.
- 90s timeout; stderr/stdout tails surfaced on non-zero exit.
- Always operates inside a TemporaryDirectory; output is base64-encoded for the MCP tool return.

HTTP (both servers):
GET /api/wiki/export?path=X&format=pdf|tex|docx|html
Decodes the base64 and streams raw bytes with correct Content-Type and Content-Disposition so the browser downloads.

Frontend (ui/unified/js/wiki.js + knowledge.css):
- Edit button + four export buttons (PDF / TEX / DOCX / HTML) in a .wiki-page-actions flex row on every page header.
- Export buttons are direct <a download> links — no JS needed on the client side. Browser downloads the Pandoc output.

Install-side instructions (for users who want PDF export):
macOS: brew install pandoc && brew install --cask mactex-no-gui
Linux: apt install pandoc texlive-xetex

Phase 10 complete. The wiki is now a legitimate scientific authoring tool:
- Edit in CodeMirror with live KaTeX math
- Cite with BibTeX, auto-generate bibliography
- Cross-refs to figures/equations/sections
- Export to any journal format

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 39177ce commit 9fa4aab

5 files changed

Lines changed: 374 additions & 4 deletions

File tree

mcp_server/handlers/wiki_export.py

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
"""Phase 10 — Pandoc-backed wiki page export.
2+
3+
Exports a wiki page (or a hand-composed markdown body) to:
4+
pdf — via Pandoc → LaTeX → PDF (requires pandoc + TeX on server)
5+
tex — Pandoc LaTeX source
6+
docx — Pandoc Word
7+
html — standalone Pandoc HTML with KaTeX/MathJax
8+
9+
Inputs come from wiki/_bibliography/*.bib (Phase 9) — Pandoc resolves
10+
`[@key]` citations against the same files the in-browser Citation.js
11+
resolves, producing a DOI-quality bibliography.
12+
13+
Path validation uses the Phase 6 CodeQL-verified commonpath sanitizer
14+
via wiki_store.read_page. Never shells out with user-controlled
15+
strings — every Pandoc argument comes from a whitelist or is routed
16+
through a temp file.
17+
"""
18+
19+
from __future__ import annotations
20+
21+
import os
22+
import re
23+
import shutil
24+
import subprocess
25+
import tempfile
26+
from pathlib import Path
27+
from typing import Any
28+
29+
30+
# Export formats the handler accepts, keyed by the public format name.
# Each entry records the Pandoc writer passed to ``--to``, the output file
# extension (identical to the format name for every supported format) and
# the PDF engine to request, or None when the format needs no engine.
_ALLOWED_FORMATS = {
    fmt: {"pandoc_to": writer, "ext": fmt, "engine": engine}
    for fmt, writer, engine in (
        ("pdf", "pdf", "pdflatex"),
        ("tex", "latex", None),
        ("docx", "docx", None),
        ("html", "html5", None),
    )
}
36+
37+
38+
# MCP tool schema for the export handler. The descriptions are surfaced to
# tool callers, so they must match what the handler actually accepts.
schema = {
    "description": (
        "Export a wiki page through Pandoc. Produces PDF/LaTeX/DOCX/HTML "
        "with bibliography, figures, cross-refs, math. Phase 10."
    ),
    "inputSchema": {
        "type": "object",
        "properties": {
            "rel_path": {
                "type": "string",
                "description": "Wiki page path (e.g. 'specs/cortex/42-foo.md').",
            },
            "body": {
                "type": "string",
                "description": (
                    "Inline markdown body (use instead of rel_path for "
                    "ad-hoc exports). Frontmatter is honoured."
                ),
            },
            "format": {
                "type": "string",
                "enum": list(_ALLOWED_FORMATS.keys()),
                "default": "pdf",
            },
            "bibliography": {
                "type": "array",
                "items": {"type": "string"},
                "description": (
                    "Override the page's frontmatter bibliography list. "
                    "Paths must be under _bibliography/."
                ),
            },
            "pandoc_args": {
                "type": "array",
                "items": {"type": "string"},
                # The previous text advertised --template, which is not on
                # the allowlist and was therefore dropped without notice.
                "description": (
                    "Extra Pandoc args. Only allowlisted flags are honoured "
                    "(--toc, --number-sections, --standalone, --section-divs, "
                    "--shift-heading-level-by=1, --shift-heading-level-by=-1, "
                    "--citeproc, --biblatex); anything else is dropped "
                    "silently."
                ),
            },
        },
    },
}
81+
82+
83+
# Pandoc flags a caller may opt into via ``pandoc_args``. Membership is an
# exact string match, so flags that take a value are listed once per
# permitted value. Anything absent from this set is discarded silently,
# which keeps user input from injecting arbitrary subprocess arguments.
_SAFE_FLAG_ALLOWLIST = {
    # Document structure
    "--standalone",
    "--toc",
    "--number-sections",
    "--section-divs",
    # Heading level adjustment (one step either way)
    "--shift-heading-level-by=1",
    "--shift-heading-level-by=-1",
    # Citation handling
    "--citeproc",
    "--biblatex",
}
95+
96+
97+
def _check_pandoc() -> str | None:
98+
"""Return the pandoc binary path, or None if missing."""
99+
return shutil.which("pandoc")
100+
101+
102+
def _filter_pandoc_args(args: list[str] | None) -> list[str]:
103+
if not args:
104+
return []
105+
return [a for a in args if a in _SAFE_FLAG_ALLOWLIST]
106+
107+
108+
def _read_body(wiki_root: Path, rel_path: str | None, body: str | None) -> str:
109+
if body is not None:
110+
return body
111+
if not rel_path:
112+
raise ValueError("rel_path or body is required")
113+
from mcp_server.infrastructure.wiki_store import read_page
114+
115+
content = read_page(wiki_root, rel_path)
116+
if content is None:
117+
raise FileNotFoundError(rel_path)
118+
return content
119+
120+
121+
def _extract_bibliography_hint(markdown: str) -> list[str]:
    """Read the inline ``bibliography: [...]`` list from page frontmatter.

    Deliberately tiny: only the inline-list form of this one field is
    recognised. Full frontmatter parsing lives in core/wiki_pages; this
    module needs a single field and avoids an extra import cycle.

    Args:
        markdown: Page source, optionally starting with a ``---`` delimited
            frontmatter block.

    Returns:
        The list entries with surrounding whitespace and quotes removed, or
        an empty list when there is no frontmatter, the frontmatter is not
        terminated, or it has no inline ``bibliography`` list.
    """
    if not markdown.startswith("---"):
        return []
    end = markdown.find("\n---", 3)
    if end < 0:
        return []
    fm = markdown[3:end]
    m = re.search(r"^\s*bibliography:\s*\[(.*?)\]", fm, re.MULTILINE)
    if not m:
        return []
    # YAML inline lists are commonly quoted (["a.bib", 'b.bib']). Left in
    # place, the quotes make every entry fail the later ".bib" suffix check,
    # so the page's bibliography would be ignored without any error.
    entries = (s.strip().strip("\"'").strip() for s in m.group(1).split(","))
    return [entry for entry in entries if entry]
136+
137+
138+
def _resolve_bibliography_paths(wiki_root: Path, hints: list[str]) -> list[Path]:
    """Map bibliography hints to existing files under wiki/_bibliography/.

    Both ``foo.bib`` and ``_bibliography/foo.bib`` are accepted. The layout
    is flat: any hint that still contains ``/`` after the optional prefix is
    removed, or that does not end in ``.bib``, is skipped.

    Args:
        wiki_root: Root directory of the wiki.
        hints: Candidate bibliography names. Non-string entries are ignored.

    Returns:
        Resolved absolute paths of the hints that name an existing file
        inside ``_bibliography/``, in first-seen order and without
        duplicates.
    """
    resolved: list[Path] = []
    seen: set[Path] = set()
    prefix = "_bibliography/"
    bib_root = (wiki_root / "_bibliography").resolve()
    for hint in hints:
        # The override list comes straight from tool input; skip anything
        # that is not a string instead of failing on .startswith().
        if not isinstance(hint, str):
            continue
        rel = hint[len(prefix):] if hint.startswith(prefix) else hint
        # Flat layout only: no sub-paths, and only .bib files.
        if "/" in rel or not rel.endswith(".bib"):
            continue
        target = (bib_root / rel).resolve()
        # Defence in depth: after resolving (which follows symlinks) the
        # target must still sit inside the bibliography directory.
        try:
            target.relative_to(bib_root)
        except ValueError:
            continue
        # "foo.bib" and "_bibliography/foo.bib" name the same file; return
        # it once so Pandoc is not handed the same bibliography twice.
        if target in seen or not target.is_file():
            continue
        seen.add(target)
        resolved.append(target)
    return resolved
163+
164+
165+
async def handler(args: dict[str, Any] | None = None) -> dict[str, Any]:
    """Export a wiki page (or an inline markdown body) through Pandoc.

    Args:
        args: Tool arguments. Recognised keys:
            ``rel_path`` – wiki page to export;
            ``body`` – inline markdown used instead of ``rel_path``;
            ``format`` – one of the ``_ALLOWED_FORMATS`` keys (default
            ``"pdf"``);
            ``bibliography`` – overrides the frontmatter bibliography list;
            ``pandoc_args`` – extra flags, filtered against the allowlist.

    Returns:
        On success a dict with ``ok``, ``format``, ``bytes``, ``mime``,
        ``content_base64`` and ``bibliography_used``. On failure a dict with
        an ``error`` message; when Pandoc exits non-zero it also carries the
        tails of ``stderr`` and ``stdout``.
    """
    import asyncio
    import functools

    args = args or {}
    rel_path = args.get("rel_path") or None
    body_arg = args.get("body")
    fmt = args.get("format", "pdf")
    if fmt not in _ALLOWED_FORMATS:
        return {"error": f"unsupported format: {fmt!r}"}

    pandoc = _check_pandoc()
    if not pandoc:
        return {
            "error": (
                "pandoc not installed on this host. Install with "
                "`brew install pandoc` (macOS) or `apt install pandoc` "
                "(Linux). For PDF export also install a TeX engine: "
                "`brew install --cask mactex-no-gui` or "
                "`apt install texlive-xetex`."
            )
        }

    from mcp_server.infrastructure.config import METHODOLOGY_DIR

    wiki_root = METHODOLOGY_DIR / "wiki"

    try:
        markdown = _read_body(wiki_root, rel_path, body_arg)
    except Exception as e:
        return {"error": f"cannot read source: {e}"}

    # Precedence: explicit override, then page frontmatter, then every
    # .bib file in _bibliography/.
    bib_hints = args.get("bibliography") or _extract_bibliography_hint(markdown)
    if not bib_hints:
        bib_dir = wiki_root / "_bibliography"
        if bib_dir.exists():
            # Sorted so the fallback order (and therefore the command line
            # and ``bibliography_used``) is deterministic across runs.
            bib_hints = sorted(p.name for p in bib_dir.glob("*.bib"))
    bib_files = _resolve_bibliography_paths(wiki_root, bib_hints)

    meta = _ALLOWED_FORMATS[fmt]

    with tempfile.TemporaryDirectory(prefix="cortex-export-") as tmpdir:
        tmp = Path(tmpdir)
        src = tmp / "page.md"
        src.write_text(markdown, encoding="utf-8")
        out = tmp / f"out.{meta['ext']}"

        cmd: list[str] = [
            pandoc,
            str(src),
            "-o",
            str(out),
            "--from",
            "markdown",
            "--to",
            meta["pandoc_to"],
            "--standalone",
        ]
        if bib_files:
            cmd.append("--citeproc")
            for bf in bib_files:
                cmd.extend(["--bibliography", str(bf)])
        if meta["engine"]:
            cmd.append(f"--pdf-engine={meta['engine']}")
        # Do not pass a flag twice: --standalone is always set above and
        # --citeproc may be, and a repeated --citeproc would presumably run
        # citation processing a second time (see the Pandoc manual).
        for flag in _filter_pandoc_args(args.get("pandoc_args")):
            if flag not in cmd:
                cmd.append(flag)

        run_pandoc = functools.partial(
            subprocess.run,
            cmd,
            cwd=tmp,
            capture_output=True,
            text=True,
            timeout=90,
            check=False,
            env={**os.environ, "HOME": os.environ.get("HOME", "/tmp")},
        )
        try:
            # Pandoc (and LaTeX behind it) can take the full 90s. Run it in
            # the default executor so this coroutine does not block the
            # event loop for other tasks while it waits.
            loop = asyncio.get_running_loop()
            completed = await loop.run_in_executor(None, run_pandoc)
        except subprocess.TimeoutExpired:
            return {"error": "pandoc timed out after 90s"}
        except Exception as e:
            return {"error": f"pandoc invocation failed: {e}"}

        if completed.returncode != 0:
            # Surface the *tail* of each stream: Pandoc and LaTeX print the
            # actual failure at the end, after pages of preamble output.
            return {
                "error": "pandoc exited non-zero",
                "stderr": (completed.stderr or "")[-2000:],
                "stdout": (completed.stdout or "")[-500:],
            }
        if not out.exists():
            return {"error": "pandoc did not produce an output file"}

        # Read before the temporary directory is removed on exit.
        data = out.read_bytes()
        return {
            "ok": True,
            "format": fmt,
            "bytes": len(data),
            "mime": {
                "pdf": "application/pdf",
                "tex": "application/x-tex",
                "docx": (
                    "application/vnd.openxmlformats-officedocument."
                    "wordprocessingml.document"
                ),
                "html": "text/html",
            }[fmt],
            "content_base64": _to_base64(data),
            "bibliography_used": [str(p.name) for p in bib_files],
        }
263+
264+
265+
def _to_base64(data: bytes) -> str:
    """Encode raw bytes as an ASCII base64 string."""
    from base64 import b64encode

    encoded = b64encode(data)
    return str(encoded, "ascii")

mcp_server/server/http_standalone.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,8 @@ def do_GET(self):
281281
self._serve_wiki_db("bibliography")
282282
elif path_no_qs == "/api/wiki/bibliography/read":
283283
self._serve_wiki_db("bibliography_read")
284+
elif path_no_qs == "/api/wiki/export":
285+
self._serve_wiki_export()
284286
elif self.path == "/api/sankey" or self.path.startswith("/api/sankey?"):
285287
self._serve_sankey()
286288
elif self.path.startswith("/api/file-diff?"):
@@ -420,6 +422,47 @@ def _serve_wiki_db(self, op: str):
420422
self.end_headers()
421423
self.wfile.write(json.dumps({"error": str(e)}).encode())
422424

425+
def _serve_wiki_export(self):
    """GET /api/wiki/export?path=X&format=pdf|tex|docx|html

    Runs the wiki export handler and returns the rendered file as a direct
    download. When the handler reports a failure, its result dict is
    returned as JSON instead; unexpected exceptions produce a 500 with a
    JSON error body.
    """
    try:
        import asyncio
        import base64
        from urllib.parse import quote

        from mcp_server.handlers.wiki_export import handler as _export

        qs = self._qs_map()
        rel_path = qs.get("path", "")
        fmt = qs.get("format", "pdf")
        result = asyncio.run(_export({"rel_path": rel_path, "format": fmt}))
        if not result.get("ok"):
            # NOTE(review): failures are sent with status 200, so a plain
            # <a download> link will save this JSON as if it were the
            # export. Consider a 4xx/5xx status once no client depends
            # on the 200.
            body = json.dumps(result, default=str).encode()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Access-Control-Allow-Origin", "*")
            self.end_headers()
            self.wfile.write(body)
            return
        data = base64.b64decode(result["content_base64"])
        # Fall back to "page" when the last path segment is empty or has no
        # stem (e.g. ".md"), so the download never ends up named ".pdf".
        stem = (rel_path.split("/")[-1] or "page").rsplit(".", 1)[0] or "page"
        download_name = f'{stem}.{result["format"]}'
        # The name derives from the query string. Build the header value
        # before any response bytes are queued and keep it pure ASCII:
        # quotes or control characters would corrupt the header, and
        # send_header() encodes as Latin-1, so other characters would raise
        # after the 200 status line had already been buffered.
        ascii_name = "".join(
            ch if ch.isascii() and (ch.isalnum() or ch in "._-") else "_"
            for ch in download_name
        )
        # filename* carries the exact UTF-8 name for clients that support
        # it; the plain filename is the sanitised fallback.
        disposition = (
            f'attachment; filename="{ascii_name}"; '
            f"filename*=UTF-8''{quote(download_name, safe='')}"
        )
        self.send_response(200)
        self.send_header("Content-Type", result["mime"])
        self.send_header("Content-Disposition", disposition)
        self.send_header("Content-Length", str(len(data)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(data)
    except Exception as e:
        self.send_response(500)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps({"error": str(e)}).encode())
465+
423466
def _serve_wiki_save(self):
424467
"""POST /api/wiki/save — body: JSON {rel_path, body}."""
425468
try:

mcp_server/server/http_viz_server.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,8 @@ def do_GET(self):
167167
self._serve_wiki_bibliography()
168168
elif path_no_qs == "/api/wiki/bibliography/read":
169169
self._serve_wiki_bibliography_read()
170+
elif path_no_qs == "/api/wiki/export":
171+
self._serve_wiki_export()
170172
elif self.path.startswith("/js/") and self.path.endswith(".js"):
171173
serve_static_file(self, js_dir, self.path[4:], "application/javascript")
172174
elif self.path.startswith("/css/") and self.path.endswith(".css"):
@@ -320,6 +322,42 @@ def _serve_wiki_bibliography_read(self):
320322
except Exception as e:
321323
send_error_response(self, e)
322324

325+
def _serve_wiki_export(self):
    """GET /api/wiki/export?path=X&format=pdf|tex|docx|html

    Streams the Pandoc-rendered bytes with the correct MIME type and
    Content-Disposition so the browser triggers a file download. The
    base64 blob itself is decoded here and never sent over HTTP; that
    representation is reserved for the MCP tool. When the handler reports
    a failure, its result dict is returned through ``send_json_response``.
    """
    try:
        import asyncio
        import base64
        from urllib.parse import quote

        from mcp_server.handlers.wiki_export import handler as _export

        qs = self._qs()
        rel_path = qs.get("path", "")
        fmt = qs.get("format", "pdf")
        result = asyncio.run(_export({"rel_path": rel_path, "format": fmt}))
        if not result.get("ok"):
            # NOTE(review): confirm which status send_json_response uses;
            # if it is 200, a plain <a download> link will save this JSON
            # as if it were the export.
            send_json_response(self, result)
            return
        data = base64.b64decode(result["content_base64"])
        # Fall back to "page" when the last path segment is empty or has no
        # stem (e.g. ".md"), so the download never ends up named ".pdf".
        stem = (rel_path.split("/")[-1] or "page").rsplit(".", 1)[0] or "page"
        download_name = f'{stem}.{result["format"]}'
        # The name derives from the query string. Build the header value
        # before any response bytes are queued and keep it pure ASCII:
        # quotes or control characters would corrupt the header, and
        # send_header() encodes as Latin-1, so other characters would raise
        # after the 200 status line had already been buffered.
        ascii_name = "".join(
            ch if ch.isascii() and (ch.isalnum() or ch in "._-") else "_"
            for ch in download_name
        )
        # filename* carries the exact UTF-8 name for clients that support
        # it; the plain filename is the sanitised fallback.
        disposition = (
            f'attachment; filename="{ascii_name}"; '
            f"filename*=UTF-8''{quote(download_name, safe='')}"
        )
        self.send_response(200)
        self.send_header("Content-Type", result["mime"])
        self.send_header("Content-Disposition", disposition)
        self.send_header("Content-Length", str(len(data)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(data)
    except Exception as e:
        send_error_response(self, e)
360+
323361
def _serve_wiki_save(self):
324362
"""POST /api/wiki/save — body: JSON {rel_path, body}."""
325363
try:

0 commit comments

Comments
 (0)