|
| 1 | +"""Unicode → LaTeX math-command conversion pass. |
| 2 | +
|
| 3 | +article.cls with the default inputenc cannot compile raw Greek or math |
| 4 | +symbols that LLMs routinely emit in prose (``accuracy ≥ 95%``, |
| 5 | +``ρ = 0.7``, ``x²``). This pass rewrites those characters into their |
| 6 | +LaTeX equivalents with the right math-mode wrap: |
| 7 | +
|
| 8 | + - A symbol in prose becomes ``$\\cmd$`` (inline math wrap) |
| 9 | + - A symbol already inside an existing ``$...$`` region becomes just |
| 10 | + ``\\cmd`` (the surrounding region already provides math mode) |
| 11 | +
|
| 12 | +Mapping data lives in ``vibe_sci/data/unicode_to_latex.yaml`` so adding |
| 13 | +a symbol doesn't require a Python change. |
| 14 | +""" |
| 15 | +from __future__ import annotations |
| 16 | + |
| 17 | +import pathlib |
| 18 | +import re |
| 19 | + |
| 20 | +import yaml |
| 21 | + |
# Location of the bundled symbol table: ``<package>/data/unicode_to_latex.yaml``,
# resolved relative to this module's parent package directory.
_DATA_PATH = pathlib.Path(__file__).parent.parent / "data" / "unicode_to_latex.yaml"

# Cache for the parsed table. It starts empty and is filled lazily by the
# first ``_load()`` call that finds a non-empty YAML file (not at import
# time). Dicts keep insertion order, so iteration order is stable across runs.
_MAPPING: dict[str, str] = {}


def _load() -> dict[str, str]:
    """Return the Unicode → LaTeX table, reading the YAML file on first use.

    Returns:
        The cached ``{symbol: latex_command}`` dict. A fresh empty dict is
        returned when the data file does not exist. Nothing is cached in
        that case, nor when the file yields no usable entries, so the next
        call looks at the file system again.
    """
    global _MAPPING
    if not _MAPPING:
        if not _DATA_PATH.exists():
            return {}
        document = _DATA_PATH.read_text(encoding="utf-8")
        # An empty YAML document parses to None; treat it as an empty table.
        parsed = yaml.safe_load(document) or {}
        # The file is expected to be a flat mapping such as
        # { "α": "\\alpha", ... }. Keys and values are coerced to str, and
        # entries with a falsy key or value are dropped.
        table = {}
        for symbol, command in parsed.items():
            if symbol and command:
                table[str(symbol)] = str(command)
        _MAPPING = table
    return _MAPPING
| 39 | + |
| 40 | + |
# Legacy splitter, kept byte-for-byte for anything that still imports it.
# It only understands single-line ``$...$`` and matches a bare ``$$`` as an
# *empty* inline region, so it cannot recognise display math;
# ``convert_unicode_math`` uses ``_MATH_REGION`` below instead.
_INLINE_MATH_SPLIT = re.compile(r"(\$[^$\n]*\$)")

# Scanner used by ``convert_unicode_math``. Alternatives, in priority order:
#   1. a backslash escape (consumed so an escaped dollar never opens a region),
#   2. display math ``$$...$$`` (may span lines, but not a blank line),
#   3. inline math ``$...$`` (single line; escaped characters allowed inside).
# Only the math alternatives set a named group, holding the region's body.
_MATH_REGION = re.compile(
    r"\\[\s\S]"
    r"|\$\$(?P<display>(?:\\[\s\S]|(?!\n[ \t]*\n)[^\\])+?)\$\$"
    r"|\$(?P<inline>(?:\\[^\n]|[^$\\\n])*)\$"
)

# A control word (backslash + letters) at the very end of a string.
_CONTROL_WORD_END = re.compile(r"\\[A-Za-z]+\Z")
# An ASCII letter, i.e. a character that would extend a control word.
_ASCII_LETTER = re.compile(r"[A-Za-z]")


def _would_fuse(left: str, right: str) -> bool:
    """Tell whether appending *right* to *left* would lengthen a control word.

    Example: ``\\alpha`` followed by ``x`` would read as ``\\alphax``.
    """
    return (
        _CONTROL_WORD_END.search(left) is not None
        and _ASCII_LETTER.match(right) is not None
    )


def _close_run(run: str, following: str, in_math: bool) -> str:
    """Finish a run of adjacent commands for the given context."""
    if not in_math:
        # One wrap for the whole run: ``$\\alpha$$\\beta$`` would make TeX
        # read the inner ``$$`` as the start of display math.
        return f"${run}$"
    # In math mode a space after a control word is ignored by TeX, so it is
    # a safe separator that does not change the typeset output.
    return f"{run} " if _would_fuse(run, following) else run


def _rewrite_symbols(text: str, mapping: dict[str, str], *, in_math: bool) -> str:
    """Replace every mapped symbol in *text* in a single left-to-right pass.

    The longest key matching at each position wins. Inserted commands are
    never re-scanned, so a command can not be rewritten by another entry.
    """
    longest = max(map(len, mapping))
    pieces: list[str] = []
    run = ""  # commands collected from the current run of adjacent symbols
    pos, end = 0, len(text)
    while pos < end:
        cmd, width = None, 1
        for size in range(min(longest, end - pos), 0, -1):
            cmd = mapping.get(text[pos:pos + size])
            if cmd is not None:
                width = size
                break
        if cmd is not None:
            run += (" " if _would_fuse(run, cmd) else "") + cmd
        else:
            if run:
                pieces.append(_close_run(run, text[pos], in_math))
                run = ""
            pieces.append(text[pos])
        pos += width
    if run:
        pieces.append(_close_run(run, "", in_math))
    return "".join(pieces)


def _emit_prose(
    out: list[str],
    prose: str,
    mapping: dict[str, str],
    after_region: bool,
    before_region: bool,
) -> None:
    """Append rewritten *prose* to *out*, guarding its edges against ``$$``.

    When a wrap created here touches the delimiter of a neighbouring math
    region, an empty group ``{}`` is inserted so the two ``$`` do not pair
    up into a display-math delimiter.
    """
    text = _rewrite_symbols(prose, mapping, in_math=False)
    # A leading/trailing ``$`` that the input did not have is a wrap we added.
    if after_region and text[:1] == "$" and prose[:1] != "$":
        out.append("{}")
    out.append(text)
    if before_region and text[-1:] == "$" and prose[-1:] != "$":
        out.append("{}")


def convert_unicode_math(s: str) -> str:
    """Replace Unicode math symbols with LaTeX commands.

    Symbols in prose are wrapped for inline math (``$\\cmd$``); a run of
    adjacent symbols shares one wrap. Symbols already inside a ``$...$`` or
    ``$$...$$`` region become bare ``\\cmd`` because the region provides
    math mode, with a space added where the command would otherwise merge
    with a following letter. An escaped dollar (``\\$``) is literal text and
    never delimits a region.

    Args:
        s: Text to convert.

    Returns:
        The converted text, or *s* unchanged when it is empty or no symbol
        mapping is available.
    """
    if not s:
        return s
    mapping = _load()
    if not mapping:
        return s

    out: list[str] = []
    cursor = 0
    after_region = False
    for match in _MATH_REGION.finditer(s):
        display, inline = match.group("display", "inline")
        if display is None and inline is None:
            continue  # backslash escape: stays part of the surrounding prose
        _emit_prose(out, s[cursor:match.start()], mapping, after_region, True)
        if display is not None:
            delim, body = "$$", display
        else:
            delim, body = "$", inline
        out.append(delim + _rewrite_symbols(body, mapping, in_math=True) + delim)
        cursor = match.end()
        after_region = True
    _emit_prose(out, s[cursor:], mapping, after_region, False)
    return "".join(out)
0 commit comments