Skip to content

Commit 5c04535

Browse files
committed
fix(wiki export): strip frontmatter before pandoc; re-inject as --metadata
Real-world pages have frontmatter such as `title: Decision: We decided to migrate from MySQL`, where the title value contains an unquoted colon (after "Decision"). Pandoc's YAML parser (stricter than our own) rejects this with 'mapping values are not allowed in this context', which blocks the export.

Fix: pre-split the frontmatter in Python (using a tolerant key:value regex) and feed only the body to pandoc with --from markdown-yaml_metadata_block (explicitly suppressing YAML metadata parsing). Title / subtitle / author / date / abstract are re-injected as pandoc --metadata flags so the title block still renders correctly in PDF/DOCX output.

Tested on a real backfilled ADR with an unquoted colon in its title: the full title string and the body are both extracted cleanly.
1 parent eed955d commit 5c04535

1 file changed

Lines changed: 50 additions & 2 deletions

File tree

mcp_server/handlers/wiki_export.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,41 @@ def _read_body(wiki_root: Path, rel_path: str | None, body: str | None) -> str:
118118
return content
119119

120120

121+
def _split_frontmatter(markdown: str) -> tuple[dict, str]:
122+
"""Parse a best-effort frontmatter block out of ``markdown``.
123+
124+
Returns (fields, body). Never raises; on any parse issue returns
125+
({}, markdown). This is intentionally forgiving because the wiki
126+
writes titles with unquoted colons ("Decision: Use Postgres")
127+
that strict YAML parsers reject. We extract what we can, then
128+
feed only the body to pandoc and re-inject title/author/abstract
129+
as pandoc --metadata flags.
130+
"""
131+
if not markdown.startswith("---"):
132+
return {}, markdown
133+
end = markdown.find("\n---", 3)
134+
if end < 0:
135+
return {}, markdown
136+
raw = markdown[3:end].strip("\n")
137+
body = markdown[end + 4 :].lstrip("\n")
138+
fields: dict[str, str] = {}
139+
for line in raw.splitlines():
140+
# Match `key: value` with value running to end-of-line; values
141+
# can contain any character including further colons.
142+
m = re.match(r"^([A-Za-z_][\w-]*)\s*:\s*(.*)$", line)
143+
if not m:
144+
continue
145+
k, v = m.group(1), m.group(2).strip()
146+
# Strip surrounding quotes if present; otherwise keep as-is.
147+
if len(v) >= 2 and v[0] == v[-1] and v[0] in ("'", '"'):
148+
v = v[1:-1]
149+
fields[k] = v
150+
return fields, body
151+
152+
153+
# Frontmatter keys that the export handler re-injects into the pandoc
# command line as ``--metadata key=value`` flags once the frontmatter
# block has been stripped from the source. Keys not listed here are
# not forwarded by that loop.
_META_KEYS_FOR_PANDOC = ("title", "subtitle", "author", "date", "abstract")
154+
155+
121156
def _extract_bibliography_hint(markdown: str) -> list[str]:
122157
"""Very small frontmatter reader — only looks for a bibliography:
123158
inline list. Full parsing lives in core/wiki_pages; this module
@@ -221,16 +256,29 @@ async def handler(args: dict[str, Any] | None = None) -> dict[str, Any]:
221256

222257
meta = _ALLOWED_FORMATS[fmt]
223258

259+
# Strip the wiki's YAML frontmatter before handing to pandoc —
260+
# the wiki writes unquoted colons in titles ("Decision: Foo")
261+
# which pandoc's stricter YAML parser rejects. We re-inject the
262+
# interesting metadata fields through pandoc's --metadata flags
263+
# instead.
264+
fm, body_only = _split_frontmatter(markdown)
265+
224266
with tempfile.TemporaryDirectory(prefix="cortex-export-") as tmpdir:
225267
tmp = Path(tmpdir)
226268
src = tmp / "page.md"
227-
src.write_text(markdown, encoding="utf-8")
269+
src.write_text(body_only, encoding="utf-8")
228270
out = tmp / f"out.{meta['ext']}"
229271

230272
cmd: list[str] = [pandoc, str(src), "-o", str(out)]
231-
cmd.extend(["--from", "markdown"])
273+
# Explicit suppression of YAML metadata block parsing — the
274+
# source we write has no frontmatter, but be defensive.
275+
cmd.extend(["--from", "markdown-yaml_metadata_block"])
232276
cmd.extend(["--to", meta["pandoc_to"]])
233277
cmd.extend(["--standalone"])
278+
# Re-inject metadata from the stripped frontmatter.
279+
for key in _META_KEYS_FOR_PANDOC:
280+
if fm.get(key):
281+
cmd.extend(["--metadata", f"{key}={fm[key]}"])
234282
if bib_files:
235283
cmd.extend(["--citeproc"])
236284
for bf in bib_files:

0 commit comments

Comments
 (0)