Skip to content

Commit 9e25d56

Browse files
committed
Add utilities for parsing markdown to HTML/MathML.
1 parent a22d826 commit 9e25d56

4 files changed

Lines changed: 343 additions & 0 deletions

File tree

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
import unittest
2+
3+
from contentcuration.utils.assessment.markdown import render_markdown
4+
from contentcuration.utils.assessment.qti import ElementTreeBase
5+
6+
7+
class TexMathTestMixin:
    """Shared test cases for the TexMath markdown plugin.

    Each test builds a markdown input and the expected HTML + MathML output
    and hands both to ``_assert_conversion``. Concrete subclasses implement
    ``_assert_conversion`` to decide how the conversion is performed and
    compared.
    """

    # unittest attribute: no limit on the length of assertion diffs.
    maxDiff = None

    def _assert_conversion(self, markdown_text: str, expected: str):
        """Override in subclasses to define assertion behavior"""
        raise NotImplementedError("Subclasses must implement _assert_conversion")

    def test_markdown_with_inline_math(self):
        """Test conversion of markdown with inline math to HTML + MathML"""

        # Backslashes are escaped explicitly so the literals contain a single
        # backslash without relying on invalid escape sequences.
        markdown_text = "What is the answer to this *question*? $$x\\cdot y=z^2$$"
        expected = (
            "<p>What is the answer to this <em>question</em>? "
            '<math display="inline">'
            "<semantics><mrow><mi>x</mi><mi>·</mi><mi>y</mi><mo>=</mo><msup><mi>z</mi><mn>2</mn></msup></mrow>"
            '<annotation encoding="application/x-tex">x\\cdot y=z^2</annotation></semantics>'
            "</math></p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_block_math(self):
        """Test conversion of block math"""

        markdown_text = (
            "Here's an equation:\n\n$$E = mc^2$$\n\nThat's Einstein's formula."
        )
        expected = (
            "<p>Here's an equation:</p>\n"
            '<math display="block">'
            "<semantics><mrow><mi>E</mi><mo>=</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow>"
            '<annotation encoding="application/x-tex">E = mc^2</annotation></semantics>'
            "</math>"
            "<p>That's Einstein's formula.</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_multiline_block_math(self):
        """
        Ensure a $$ … $$ block spanning multiple lines is converted to MathML
        and the literal $$ delimiters are removed.

        Regression test: the block contains a lone '$', so a pattern that
        stops at the first '$' would fail to match the whole block.
        """
        markdown_text = (
            "$$\n"
            "\\begin{aligned}\n"
            "a = b + c \\\\\n"
            "$5 = d + e\n"
            "\\end{aligned}\n"
            "$$"
        )
        expected = (
            '<math display="block">'
            "<semantics><mrow><mrow><mi>a</mi><mo>=</mo><mi>b</mi><mo>+</mo><mi>c</mi>"
            '<mspace linebreak="newline" /><mi>$</mi><mn>5</mn><mo>=</mo><mi>d</mi><mo>+</mo><mi>e</mi></mrow></mrow>'
            '<annotation encoding="application/x-tex">\n\\begin{aligned}\na = b + c \\\\\n$5 = d + e\n\\end{aligned}\n</annotation></semantics>'
            "</math>"
        )

        self._assert_conversion(markdown_text, expected)

    def test_inline_math_with_dollar_inside(self):
        """
        Ensure a $$ … $$ inline that contains an internal '$' (e.g. inside a
        TeX text command) is parsed correctly.

        Regression test: a pattern that stops at the first '$' would end the
        match prematurely and leave the delimiters in the output.
        """
        markdown_text = "Test this $$\\text{Cost = 1.00 $USD$}$$"
        expected = (
            "<p>Test this "
            '<math display="inline">'
            "<semantics><mrow><mtext>Cost = 1.00 $USD$</mtext></mrow>"
            '<annotation encoding="application/x-tex">\\text{Cost = 1.00 $USD$}</annotation></semantics>'
            "</math></p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_multiple_math_expressions(self):
        """Test multiple math expressions in one document"""

        markdown_text = "First: $$a + b$$, then $$c \\times d$$, finally $$e^f$$."
        expected = (
            "<p>First: "
            '<math display="inline"><semantics><mrow><mi>a</mi><mo>+</mo><mi>b</mi></mrow>'
            '<annotation encoding="application/x-tex">a + b</annotation></semantics></math>'
            ", then "
            '<math display="inline"><semantics><mrow><mi>c</mi><mi>×</mi><mi>d</mi></mrow>'
            '<annotation encoding="application/x-tex">c \\times d</annotation></semantics></math>'
            ", finally "
            '<math display="inline"><semantics><mrow><msup><mi>e</mi><mi>f</mi></msup></mrow>'
            '<annotation encoding="application/x-tex">e^f</annotation></semantics></math>'
            ".</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_mixed_inline_and_block(self):
        """Test document with both inline and block math"""

        markdown_text = (
            "This is inline math: $$a = b$$\n\n"
            "And this is block math:\n\n"
            "$$\\sum_{i=1}^{n} x_i = y$$\n\n"
            "Back to text with more inline: $$z^2$$"
        )
        expected = (
            "<p>This is inline math: "
            '<math display="inline"><semantics><mrow><mi>a</mi><mo>=</mo><mi>b</mi></mrow>'
            '<annotation encoding="application/x-tex">a = b</annotation></semantics></math>'
            "</p>\n"
            "<p>And this is block math:</p>\n"
            '<math display="block">'
            "<semantics><mrow><msubsup><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mrow>"
            "<mi>n</mi></mrow></msubsup><msub><mi>x</mi><mi>i</mi></msub><mo>=</mo><mi>y</mi></mrow>"
            '<annotation encoding="application/x-tex">\\sum_{i=1}^{n} x_i = y</annotation></semantics>'
            "</math>"
            "<p>Back to text with more inline: "
            '<math display="inline"><semantics><mrow><msup><mi>z</mi><mn>2</mn></msup></mrow>'
            '<annotation encoding="application/x-tex">z^2</annotation></semantics></math>'
            "</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_no_math_content(self):
        """Test that regular markdown without math still works"""

        markdown_text = "This is just *regular* markdown with **bold** text."
        expected = "<p>This is just <em>regular</em> markdown with <strong>bold</strong> text.</p>\n"

        self._assert_conversion(markdown_text, expected)

    def test_simple_inline_math(self):
        """Test simple inline math expression"""

        markdown_text = "The variable $$x$$ is unknown."
        expected = (
            "<p>The variable "
            '<math display="inline"><semantics><mrow><mi>x</mi></mrow>'
            '<annotation encoding="application/x-tex">x</annotation></semantics></math>'
            " is unknown.</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_simple_block_math(self):
        """Test simple block math expression"""

        markdown_text = "$$y = mx + b$$"
        expected = (
            '<math display="block">'
            "<semantics><mrow><mi>y</mi><mo>=</mo><mi>m</mi><mi>x</mi><mo>+</mo><mi>b</mi></mrow>"
            '<annotation encoding="application/x-tex">y = mx + b</annotation></semantics>'
            "</math>"
        )

        self._assert_conversion(markdown_text, expected)
170+
171+
172+
class TestTexMathPlugin(TexMathTestMixin, unittest.TestCase):
    """Run the shared TexMath cases against direct rendering.

    Pipeline under test: markdown → HTML+MathML.
    """

    def _assert_conversion(self, markdown_text: str, expected: str):
        """Render ``markdown_text`` and require an exact match with ``expected``."""
        rendered = render_markdown(markdown_text)
        self.assertEqual(rendered, expected)
179+
180+
181+
class TestTexMathPluginRoundtrip(TexMathTestMixin, unittest.TestCase):
    """Run the shared TexMath cases through a parse/serialize roundtrip.

    Pipeline under test: markdown → HTML+MathML → Pydantic → string.
    """

    maxDiff = None

    def _assert_conversion(self, markdown_text: str, expected: str):
        """Render, parse into element objects, serialize back, then compare.

        Newlines are removed from both sides before comparing, so only the
        markup itself has to survive the roundtrip.
        """
        html = render_markdown(markdown_text)

        # Parse to Pydantic objects and back to string. The parser may hand
        # back either a single element or a list of elements.
        parsed = ElementTreeBase.from_string(html)
        if isinstance(parsed, list):
            pieces = [element.to_xml_string().strip() for element in parsed]
            roundtrip_result = "".join(pieces)
        else:
            roundtrip_result = parsed.to_xml_string().strip()

        actual_flat = roundtrip_result.replace("\n", "").strip()
        expected_flat = expected.replace("\n", "").strip()
        self.assertEqual(actual_flat, expected_flat)
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import re
2+
import xml.etree.ElementTree as ET
3+
4+
from latex2mathml.converter import convert
5+
from markdown_it import MarkdownIt
6+
from markdown_it.renderer import RendererProtocol
7+
from markdown_it.rules_block import StateBlock
8+
from markdown_it.rules_inline import StateInline
9+
from markdown_it.token import Token
10+
from markdown_it.utils import EnvType
11+
from markdown_it.utils import OptionsDict
12+
13+
from contentcuration.utils.assessment.qti.mathml.core import Annotation
14+
from contentcuration.utils.assessment.qti.mathml.core import Semantics
15+
16+
17+
# Regex patterns for $$ delimited math.
# Both patterns require the text to start with "$$" and lazily capture
# everything (including newlines, via [\s\S]) up to the next "$$"; group 1 is
# the TeX source between the delimiters. The rules in this module apply them
# with .match() to a slice of the source that begins at the candidate position.
INLINE_PATTERN = re.compile(r"^\$\$([\s\S]+?)\$\$")
BLOCK_PATTERN = re.compile(r"^\$\$([\s\S]+?)\$\$", re.M)
20+
21+
22+
def math_inline_func(state: StateInline, silent: bool) -> bool:
    """Parse inline math: $$expression$$

    Inline rule for markdown-it. Succeeds when the text at ``state.pos``
    starts with ``$$`` and a closing ``$$`` follows before ``state.posMax``.

    :param state: inline parser state; ``pos`` is advanced past the closing
        delimiter on success.
    :param silent: when True, only validate and advance; no token is pushed.
    :return: True if a math expression was consumed, False otherwise.
    """
    if not state.src.startswith("$$", state.pos):
        return False

    # Limit the search to the range currently being tokenized. Nested inline
    # parses narrow posMax, and matching past it would consume text that
    # belongs to the enclosing construct.
    match = INLINE_PATTERN.match(state.src[state.pos : state.posMax])
    if not match:
        return False

    if not silent:
        token = state.push("math_inline", "math", 0)
        token.content = match.group(1)
        token.markup = "$$"

    # match.end() is relative to the slice, which starts at state.pos.
    state.pos += match.end()
    return True
38+
39+
40+
def math_block_func(
    state: StateBlock, begLine: int, endLine: int, silent: bool
) -> bool:
    """Parse block math: $$expression$$

    Block rule for markdown-it. Succeeds when the line ``begLine`` starts
    with ``$$`` and a closing ``$$`` is found before the end of line
    ``endLine - 1``, with nothing but whitespace after the closing delimiter
    on its line. A line such as ``$$x$$ is unknown.`` is therefore rejected
    here so that its trailing text is not discarded.

    :param state: block parser state; ``line`` is advanced past the block on
        success.
    :param begLine: index of the first line to examine.
    :param endLine: index one past the last line this rule may consume.
    :param silent: when True, no token is pushed.
    :return: True if a math block was consumed, False otherwise.
    """
    begin = state.bMarks[begLine] + state.tShift[begLine]

    if not state.src.startswith("$$", begin):
        return False

    # Do not look past the last line this rule is allowed to consume.
    limit = state.eMarks[endLine - 1]
    match = BLOCK_PATTERN.match(state.src[begin:limit])
    if not match:
        return False

    # Index of the final "$" of the closing delimiter.
    endpos = begin + match.end() - 1

    # Locate the line that contains the closing delimiter.
    closing_line = None
    for line in range(begLine, endLine):
        if state.bMarks[line] <= endpos <= state.eMarks[line]:
            closing_line = line
            break
    if closing_line is None:
        # Never report success without being able to advance state.line.
        return False

    # Anything other than whitespace after the closing "$$" means this is not
    # a standalone math block; claiming it would silently drop that text.
    if state.src[endpos + 1 : state.eMarks[closing_line]].strip():
        return False

    if not silent:
        token = state.push("math_block", "math", 0)
        token.block = True
        token.content = match.group(1)
        token.markup = "$$"

    # Advance to next line after the math block
    state.line = closing_line + 1
    return True
69+
70+
71+
def _convert(latex, inline=True):
    """Convert a LaTeX expression to a MathML string carrying a TeX annotation.

    The children produced by latex2mathml are moved into a ``semantics``
    element, followed by an ``annotation`` element (encoding
    ``application/x-tex``) holding the original LaTeX source.

    :param latex: the LaTeX source of the expression.
    :param inline: True for ``display="inline"``, False for ``display="block"``.
    :return: the serialized ``math`` element as a unicode string.
    """
    # Remove the namespace declaration for cleaner output
    markup = convert(latex, display="inline" if inline else "block").replace(
        ' xmlns="http://www.w3.org/1998/Math/MathML"', ""
    )
    # By default latex2mathml encodes operators that don't need to be encoded
    # so we parse it with ElementTree and turn it back into a string here for consistency.
    math_element = ET.fromstring(markup)

    # Create LaTeX annotation
    latex_annotation_element = Annotation(
        encoding="application/x-tex", children=[latex]
    ).to_element()

    semantics_element = Semantics().to_element()
    # Iterate over a snapshot: removing children from an Element while
    # iterating over it directly skips siblings.
    for child in list(math_element):
        math_element.remove(child)
        semantics_element.append(child)
    semantics_element.append(latex_annotation_element)
    math_element.append(semantics_element)

    return ET.tostring(math_element, encoding="unicode")
93+
94+
95+
def render_math_inline(
    self: RendererProtocol,
    tokens: list[Token],
    idx: int,
    options: OptionsDict,
    env: EnvType,
) -> str:
    """Render the ``math_inline`` token at ``tokens[idx]`` as inline MathML.

    Only the token's ``content`` is used; ``options`` and ``env`` are ignored.
    """
    latex_source = tokens[idx].content
    return _convert(latex_source, inline=True)
104+
105+
106+
def render_math_block(
    self: RendererProtocol,
    tokens: list[Token],
    idx: int,
    options: OptionsDict,
    env: EnvType,
) -> str:
    """Render the ``math_block`` token at ``tokens[idx]`` as block MathML.

    Only the token's ``content`` is used; ``options`` and ``env`` are ignored.
    """
    latex_source = tokens[idx].content
    return _convert(latex_source, inline=False)
115+
116+
117+
def texmath_to_mathml_plugin(md: MarkdownIt) -> None:
    """markdown-it plugin for TeX math written between $$ delimiters.

    Installs an inline rule (ahead of "escape") and a block rule (ahead of
    "fence") that emit ``math_inline`` / ``math_block`` tokens, and the
    render rules that turn those tokens into MathML via latex2mathml.
    """
    # Parsing rules: where the math tokens come from.
    inline_ruler = md.inline.ruler
    block_ruler = md.block.ruler
    inline_ruler.before("escape", "math_inline", math_inline_func)
    block_ruler.before("fence", "math_block", math_block_func)

    # Render rules: how each math token type becomes markup.
    renderers = (
        ("math_inline", render_math_inline),
        ("math_block", render_math_block),
    )
    for token_type, renderer in renderers:
        md.add_render_rule(token_type, renderer)
129+
130+
131+
# Shared parser instance, built once at import time: the "gfm-like" preset
# with the "linkify" rule disabled and the TeX math plugin installed.
# NOTE(review): "linkify" is presumably disabled to avoid an optional
# dependency of that preset — confirm.
md = MarkdownIt("gfm-like").disable("linkify").use(texmath_to_mathml_plugin)
132+
133+
134+
def render_markdown(markdown):
    """Render a markdown string with the module-level parser.

    $$-delimited TeX math in the input is emitted as MathML.

    :param markdown: the markdown source text.
    :return: the rendered markup as a string.
    """
    rendered = md.render(markdown)
    return rendered

requirements.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,5 @@ django-celery-results
3535
packaging>=21.0
3636
langcodes==3.5.0
3737
pydantic==2.11.5
38+
latex2mathml==3.78.0
39+
markdown-it-py==3.0.0

requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,16 @@ langcodes==3.5.0
160160
# via -r requirements.in
161161
language-data==1.3.0
162162
# via langcodes
163+
latex2mathml==3.78.0
164+
# via -r requirements.in
163165
le-utils==0.2.12
164166
# via -r requirements.in
165167
marisa-trie==1.2.1
166168
# via language-data
169+
markdown-it-py==3.0.0
170+
# via -r requirements.in
171+
mdurl==0.1.2
172+
# via markdown-it-py
167173
packaging==25.0
168174
# via
169175
# -r requirements.in

0 commit comments

Comments
 (0)