Skip to content

Commit 41eaec7

Browse files
committed
Add utilities for parsing markdown to HTML/MathML.
1 parent 0a946ee commit 41eaec7

4 files changed

Lines changed: 311 additions & 0 deletions

File tree

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
import unittest
2+
3+
from contentcuration.utils.assessment.markdown import render_markdown
4+
from contentcuration.utils.assessment.qti import ElementTreeBase
5+
6+
7+
class TexMathTestMixin:
    """Shared test cases for the TexMath markdown plugin.

    Each test builds a markdown input and the HTML + MathML it is expected
    to produce, then delegates the comparison to ``_assert_conversion``.
    Concrete test classes combine this mixin with ``unittest.TestCase`` and
    supply that method.
    """

    def _assert_conversion(self, markdown_text: str, expected: str):
        """Compare the rendering of ``markdown_text`` against ``expected``.

        Must be overridden by subclasses; the mixin itself does not know how
        the conversion is performed.
        """
        raise NotImplementedError("Subclasses must implement _assert_conversion")

    def test_markdown_with_inline_math(self):
        """Inline math embedded in formatted text becomes inline MathML."""
        # The backslash is escaped explicitly: a bare "\c" is an invalid
        # escape sequence in a normal string literal.
        markdown_text = "What is the answer to this *question*? $$x\\cdot y=z^2$$"
        expected = (
            "<p>What is the answer to this <em>question</em>? "
            '<math display="inline">'
            "<mrow><mi>x</mi><mi>·</mi><mi>y</mi><mo>=</mo><msup><mi>z</mi><mn>2</mn></msup></mrow>"
            "</math></p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_block_math(self):
        """A ``$$`` paragraph between text paragraphs becomes block MathML."""
        markdown_text = (
            "Here's an equation:\n\n$$E = mc^2$$\n\nThat's Einstein's formula."
        )
        expected = (
            "<p>Here's an equation:</p>\n"
            '<math display="block">'
            "<mrow><mi>E</mi><mo>=</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow>"
            "</math>"
            "<p>That's Einstein's formula.</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_multiline_block_math(self):
        """
        Ensure a $$ … $$ block spanning multiple lines is converted to MathML
        and the literal $$ delimiters are removed. Regression guard: a
        delimiter pattern that stops at the first single '$' would fail here.
        """
        markdown_text = (
            "$$\n"
            "\\begin{aligned}\n"
            "a = b + c \\\\\n"
            "$5 = d + e\n"
            "\\end{aligned}\n"
            "$$"
        )
        expected = (
            '<math display="block">'
            '<mrow><mrow><mi>a</mi><mo>=</mo><mi>b</mi><mo>+</mo><mi>c</mi><mspace linebreak="newline" /><mi>$</mi><mn>5</mn><mo>=</mo><mi>d</mi><mo>+</mo><mi>e</mi></mrow></mrow>'  # noqa: E501
            "</math>"
        )

        self._assert_conversion(markdown_text, expected)

    def test_inline_math_with_dollar_inside(self):
        """
        Ensure a $$ … $$ inline that contains an internal '$' (e.g. inside
        \\text{}) is parsed correctly. Regression guard: a delimiter pattern
        that terminates at the first single '$' would leave the delimiters
        in the output.
        """
        markdown_text = "Test this $$\\text{Cost = 1.00 $USD$}$$"
        expected = (
            "<p>Test this "
            '<math display="inline">'
            "<mrow><mtext>Cost = 1.00 $USD$</mtext></mrow>"
            "</math></p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_multiple_math_expressions(self):
        """Several inline expressions in one paragraph are each converted."""
        markdown_text = "First: $$a + b$$, then $$c \\times d$$, finally $$e^f$$."
        expected = (
            "<p>First: "
            '<math display="inline"><mrow><mi>a</mi><mo>+</mo><mi>b</mi></mrow></math>'
            ", then "
            '<math display="inline"><mrow><mi>c</mi><mi>×</mi><mi>d</mi></mrow></math>'
            ", finally "
            '<math display="inline"><mrow><msup><mi>e</mi><mi>f</mi></msup></mrow></math>'
            ".</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_mixed_inline_and_block(self):
        """A document mixing inline and block math renders both kinds."""
        markdown_text = (
            "This is inline math: $$a = b$$\n\n"
            "And this is block math:\n\n"
            "$$\\sum_{i=1}^{n} x_i = y$$\n\n"
            "Back to text with more inline: $$z^2$$"
        )
        expected = (
            "<p>This is inline math: "
            '<math display="inline"><mrow><mi>a</mi><mo>=</mo><mi>b</mi></mrow></math>'
            "</p>\n"
            "<p>And this is block math:</p>\n"
            '<math display="block">'
            "<mrow><msubsup><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mrow><mi>n</mi></mrow></msubsup>"
            "<msub><mi>x</mi><mi>i</mi></msub><mo>=</mo><mi>y</mi></mrow>"
            "</math>"
            "<p>Back to text with more inline: "
            '<math display="inline"><mrow><msup><mi>z</mi><mn>2</mn></msup></mrow></math>'
            "</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_no_math_content(self):
        """Markdown without any math is rendered as ordinary HTML."""
        markdown_text = "This is just *regular* markdown with **bold** text."
        expected = "<p>This is just <em>regular</em> markdown with <strong>bold</strong> text.</p>\n"

        self._assert_conversion(markdown_text, expected)

    def test_simple_inline_math(self):
        """A single inline expression inside a sentence."""
        markdown_text = "The variable $$x$$ is unknown."
        expected = (
            "<p>The variable "
            '<math display="inline"><mrow><mi>x</mi></mrow></math>'
            " is unknown.</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_simple_block_math(self):
        """A document consisting only of one block expression."""
        markdown_text = "$$y = mx + b$$"
        expected = (
            '<math display="block">'
            "<mrow><mi>y</mi><mo>=</mo><mi>m</mi><mi>x</mi><mo>+</mo><mi>b</mi></mrow>"
            "</math>"
        )

        self._assert_conversion(markdown_text, expected)
155+
156+
157+
class TestTexMathPlugin(TexMathTestMixin, unittest.TestCase):
    """Run the shared cases against the direct path: markdown → HTML+MathML."""

    def _assert_conversion(self, markdown_text: str, expected: str):
        """Render ``markdown_text`` and require an exact match with ``expected``."""
        self.assertEqual(render_markdown(markdown_text), expected)
164+
165+
166+
class TestTexMathPluginRoundtrip(TexMathTestMixin, unittest.TestCase):
    """Run the shared cases through: markdown → HTML+MathML → Pydantic → string."""

    # Show full diffs on failure.
    maxDiff = None

    def _assert_conversion(self, markdown_text: str, expected: str):
        """Render, parse into element objects, serialize again, then compare.

        Newlines are removed from ``expected`` before comparing, and each
        serialized element is stripped of surrounding whitespace.
        """
        html = render_markdown(markdown_text)

        # from_string may yield one element or a list of them.
        parsed = ElementTreeBase.from_string(html)
        if isinstance(parsed, list):
            serialized = "".join(
                element.to_xml_string().strip() for element in parsed
            )
        else:
            serialized = parsed.to_xml_string().strip()

        normalized_expected = expected.replace("\n", "").strip()
        self.assertEqual(serialized, normalized_expected)
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import re
2+
import xml.etree.ElementTree as ET
3+
4+
from latex2mathml.converter import convert
5+
from markdown_it import MarkdownIt
6+
from markdown_it.renderer import RendererProtocol
7+
from markdown_it.rules_block import StateBlock
8+
from markdown_it.rules_inline import StateInline
9+
from markdown_it.token import Token
10+
from markdown_it.utils import EnvType
11+
from markdown_it.utils import OptionsDict
12+
13+
14+
# "$$ ... $$" delimits math in both inline and block position. The body is
# matched lazily and may span newlines; it must contain at least one character.
_DELIMITED_MATH = r"^\$\$([\s\S]+?)\$\$"
INLINE_PATTERN = re.compile(_DELIMITED_MATH)
BLOCK_PATTERN = re.compile(_DELIMITED_MATH, re.MULTILINE)
17+
18+
19+
def math_inline_func(state: StateInline, silent: bool) -> bool:
    """Inline rule recognising ``$$expression$$`` at the current position.

    Returns False (leaving ``state`` untouched) when the text at
    ``state.pos`` is not a complete ``$$ ... $$`` span. On success the
    position is moved past the closing delimiter; a ``math_inline`` token
    carrying the expression is pushed unless ``silent`` is set.
    """
    start = state.pos
    if not state.src.startswith("$$", start):
        return False

    # The pattern is anchored with "^", so it is applied to the remainder.
    found = INLINE_PATTERN.match(state.src[start:])
    if not found:
        return False

    if not silent:
        math_token = state.push("math_inline", "math", 0)
        math_token.content = found.group(1)
        math_token.markup = "$$"

    state.pos = start + found.end()
    return True
35+
36+
37+
def math_block_func(
    state: StateBlock, begLine: int, endLine: int, silent: bool
) -> bool:
    """Block rule recognising ``$$expression$$`` starting at ``begLine``.

    The expression may span several lines. The rule only matches when the
    closing ``$$`` lies on a line inside ``[begLine, endLine)`` and is
    followed by nothing but whitespace on that line; otherwise it returns
    False without touching ``state`` so other rules (including the inline
    math rule) can handle the text.

    Returns True on a match. Unless ``silent`` is set, a ``math_block``
    token holding the expression is pushed and ``state.line`` is moved to
    the line after the closing delimiter. In silent mode the state is left
    unmodified.
    """
    begin = state.bMarks[begLine] + state.tShift[begLine]

    if not state.src.startswith("$$", begin):
        return False

    match = BLOCK_PATTERN.match(state.src[begin:])
    if not match:
        return False

    # Absolute offset of the last character of the closing "$$".
    endpos = begin + match.end() - 1

    # Locate the line holding the closing delimiter. If it is outside the
    # range we were asked to parse, reporting success would leave state.line
    # where it was and the caller could not make progress.
    closing_line = next(
        (
            line
            for line in range(begLine, endLine)
            if state.bMarks[line] <= endpos <= state.eMarks[line]
        ),
        None,
    )
    if closing_line is None:
        return False

    # Consuming the whole line as a block would discard any text that
    # follows the closing delimiter, so such lines are not block math.
    if state.src[endpos + 1 : state.eMarks[closing_line]].strip():
        return False

    if silent:
        return True

    token = state.push("math_block", "math", 0)
    token.block = True
    token.content = match.group(1)
    token.markup = "$$"

    # Continue parsing on the line after the math block.
    state.line = closing_line + 1
    return True
66+
67+
68+
def _convert(latex, inline=True):
    """Convert a LaTeX expression to a MathML string.

    ``inline`` selects the ``display`` mode passed to latex2mathml
    ("inline" when true, "block" otherwise).
    """
    display_mode = "inline" if inline else "block"
    mathml = convert(latex, display=display_mode)
    # Drop the namespace declaration for cleaner output.
    mathml = mathml.replace(' xmlns="http://www.w3.org/1998/Math/MathML"', "")
    # latex2mathml encodes operators that don't need encoding, so the markup
    # is parsed with ElementTree and serialized again for consistency.
    container = ET.fromstring(f"<root>{mathml}</root>")
    return "".join(ET.tostring(child, encoding="unicode") for child in container)
78+
79+
80+
def render_math_inline(
    self: RendererProtocol,
    tokens: list[Token],
    idx: int,
    options: OptionsDict,
    env: EnvType,
) -> str:
    """Render the ``math_inline`` token at ``tokens[idx]`` as inline MathML."""
    latex = tokens[idx].content
    return _convert(latex, inline=True)
89+
90+
91+
def render_math_block(
    self: RendererProtocol,
    tokens: list[Token],
    idx: int,
    options: OptionsDict,
    env: EnvType,
) -> str:
    """Render the ``math_block`` token at ``tokens[idx]`` as block MathML."""
    latex = tokens[idx].content
    return _convert(latex, inline=False)
100+
101+
102+
def texmath_to_mathml_plugin(md: MarkdownIt) -> None:
    """markdown-it plugin for TeX math written between ``$$`` delimiters.

    Installs the inline and block parsing rules and the render rules that
    turn the resulting tokens into MathML via latex2mathml.
    """
    # Parsing: the inline rule goes ahead of "escape", the block rule ahead
    # of "fence".
    md.inline.ruler.before("escape", "math_inline", math_inline_func)
    md.block.ruler.before("fence", "math_block", math_block_func)

    # Rendering: one render rule per token type pushed by the rules above.
    md.add_render_rule("math_inline", render_math_inline)
    md.add_render_rule("math_block", render_math_block)
114+
115+
116+
# Module-level parser: "gfm-like" preset with linkify turned off and the
# TeX math plugin installed.
md = (
    MarkdownIt("gfm-like")
    .disable("linkify")
    .use(texmath_to_mathml_plugin)
)
117+
118+
119+
def render_markdown(markdown):
    """Render ``markdown`` with the module-level parser and return the result."""
    rendered = md.render(markdown)
    return rendered

requirements.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,5 @@ django-celery-results
3535
packaging>=21.0
3636
langcodes==3.5.0
3737
pydantic==2.11.5
38+
latex2mathml==3.78.0
39+
markdown-it-py==3.0.0

requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,16 @@ langcodes==3.5.0
160160
# via -r requirements.in
161161
language-data==1.3.0
162162
# via langcodes
163+
latex2mathml==3.78.0
164+
# via -r requirements.in
163165
le-utils==0.2.10
164166
# via -r requirements.in
165167
marisa-trie==1.2.1
166168
# via language-data
169+
markdown-it-py==3.0.0
170+
# via -r requirements.in
171+
mdurl==0.1.2
172+
# via markdown-it-py
167173
packaging==25.0
168174
# via
169175
# -r requirements.in

0 commit comments

Comments
 (0)