Skip to content

Commit 2ff8e7d

Browse files
committed
Add utilities for parsing markdown to HTML/MathML.
1 parent 6310661 commit 2ff8e7d

4 files changed

Lines changed: 341 additions & 0 deletions

File tree

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
import unittest
2+
3+
from contentcuration.utils.assessment.markdown import render_markdown
4+
from contentcuration.utils.assessment.qti import ElementTreeBase
5+
6+
7+
class TexMathTestMixin:
    """Shared test cases for the TexMath markdown plugin.

    Concrete test classes combine this mixin with ``unittest.TestCase`` and
    implement ``_assert_conversion`` to decide how the rendered output is
    compared with the expected markup.
    """

    def _assert_conversion(self, markdown_text: str, expected: str):
        """Compare the conversion of ``markdown_text`` against ``expected``.

        Must be overridden by subclasses; the mixin itself cannot render.

        Raises:
            NotImplementedError: always, when not overridden.
        """
        raise NotImplementedError("Subclasses must implement _assert_conversion")

    def test_markdown_with_inline_math(self):
        """Test conversion of markdown with inline math to HTML + MathML"""

        # Backslashes are escaped explicitly instead of relying on invalid
        # escape sequences such as "\c"; the resulting strings are unchanged.
        markdown_text = "What is the answer to this *question*? $$x\\cdot y=z^2$$"
        expected = (
            "<p>What is the answer to this <em>question</em>? "
            '<math display="inline">'
            "<semantics><mrow><mi>x</mi><mi>·</mi><mi>y</mi><mo>=</mo><msup><mi>z</mi><mn>2</mn></msup></mrow>"
            '<annotation encoding="application/x-tex">x\\cdot y=z^2</annotation></semantics>'
            "</math></p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_block_math(self):
        """Test conversion of block math"""

        markdown_text = (
            "Here's an equation:\n\n$$E = mc^2$$\n\nThat's Einstein's formula."
        )
        expected = (
            "<p>Here's an equation:</p>\n"
            '<math display="block">'
            "<semantics><mrow><mi>E</mi><mo>=</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow>"
            '<annotation encoding="application/x-tex">E = mc^2</annotation></semantics>'
            "</math>"
            "<p>That's Einstein's formula.</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_multiline_block_math(self):
        """
        Ensure a $$ ... $$ block spanning multiple lines is converted to MathML
        and the literal $$ delimiters are removed.

        Regression test: a delimiter pattern that forbids '$' inside the body
        would stop at the single '$' on the fourth line and leave the
        delimiters in the output.
        """
        markdown_text = (
            "$$\n"
            "\\begin{aligned}\n"
            "a = b + c \\\\\n"
            "$5 = d + e\n"
            "\\end{aligned}\n"
            "$$"
        )
        expected = (
            '<math display="block">'
            "<semantics><mrow><mrow><mi>a</mi><mo>=</mo><mi>b</mi><mo>+</mo><mi>c</mi>"
            '<mspace linebreak="newline" /><mi>$</mi><mn>5</mn><mo>=</mo><mi>d</mi><mo>+</mo><mi>e</mi></mrow></mrow>'
            '<annotation encoding="application/x-tex">\n\\begin{aligned}\na = b + c \\\\\n$5 = d + e\n\\end{aligned}\n</annotation></semantics>'
            "</math>"
        )

        self._assert_conversion(markdown_text, expected)

    def test_inline_math_with_dollar_inside(self):
        """
        Ensure a $$ ... $$ inline expression that contains single '$'
        characters (here inside a text command) is parsed correctly.

        Regression test: a delimiter pattern that forbids '$' inside the body
        would terminate at the first inner '$' and leave the delimiters in the
        output.
        """
        markdown_text = "Test this $$\\text{Cost = 1.00 $USD$}$$"
        expected = (
            "<p>Test this "
            '<math display="inline">'
            "<semantics><mrow><mtext>Cost = 1.00 $USD$</mtext></mrow>"
            '<annotation encoding="application/x-tex">\\text{Cost = 1.00 $USD$}</annotation></semantics>'
            "</math></p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_multiple_math_expressions(self):
        """Test multiple math expressions in one document"""

        markdown_text = "First: $$a + b$$, then $$c \\times d$$, finally $$e^f$$."
        expected = (
            "<p>First: "
            '<math display="inline"><semantics><mrow><mi>a</mi><mo>+</mo><mi>b</mi></mrow>'
            '<annotation encoding="application/x-tex">a + b</annotation></semantics></math>'
            ", then "
            '<math display="inline"><semantics><mrow><mi>c</mi><mi>×</mi><mi>d</mi></mrow>'
            '<annotation encoding="application/x-tex">c \\times d</annotation></semantics></math>'
            ", finally "
            '<math display="inline"><semantics><mrow><msup><mi>e</mi><mi>f</mi></msup></mrow>'
            '<annotation encoding="application/x-tex">e^f</annotation></semantics></math>'
            ".</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_mixed_inline_and_block(self):
        """Test document with both inline and block math"""

        markdown_text = (
            "This is inline math: $$a = b$$\n\n"
            "And this is block math:\n\n"
            "$$\\sum_{i=1}^{n} x_i = y$$\n\n"
            "Back to text with more inline: $$z^2$$"
        )
        expected = (
            "<p>This is inline math: "
            '<math display="inline"><semantics><mrow><mi>a</mi><mo>=</mo><mi>b</mi></mrow>'
            '<annotation encoding="application/x-tex">a = b</annotation></semantics></math>'
            "</p>\n"
            "<p>And this is block math:</p>\n"
            '<math display="block">'
            "<semantics><mrow><msubsup><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mrow>"
            "<mi>n</mi></mrow></msubsup><msub><mi>x</mi><mi>i</mi></msub><mo>=</mo><mi>y</mi></mrow>"
            '<annotation encoding="application/x-tex">\\sum_{i=1}^{n} x_i = y</annotation></semantics>'
            "</math>"
            "<p>Back to text with more inline: "
            '<math display="inline"><semantics><mrow><msup><mi>z</mi><mn>2</mn></msup></mrow>'
            '<annotation encoding="application/x-tex">z^2</annotation></semantics></math>'
            "</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_no_math_content(self):
        """Test that regular markdown without math still works"""

        markdown_text = "This is just *regular* markdown with **bold** text."
        expected = "<p>This is just <em>regular</em> markdown with <strong>bold</strong> text.</p>\n"

        self._assert_conversion(markdown_text, expected)

    def test_simple_inline_math(self):
        """Test simple inline math expression"""

        markdown_text = "The variable $$x$$ is unknown."
        expected = (
            "<p>The variable "
            '<math display="inline"><semantics><mrow><mi>x</mi></mrow>'
            '<annotation encoding="application/x-tex">x</annotation></semantics></math>'
            " is unknown.</p>\n"
        )

        self._assert_conversion(markdown_text, expected)

    def test_simple_block_math(self):
        """Test simple block math expression"""

        markdown_text = "$$y = mx + b$$"
        expected = (
            '<math display="block">'
            "<semantics><mrow><mi>y</mi><mo>=</mo><mi>m</mi><mi>x</mi><mo>+</mo><mi>b</mi></mrow>"
            '<annotation encoding="application/x-tex">y = mx + b</annotation></semantics>'
            "</math>"
        )

        self._assert_conversion(markdown_text, expected)
168+
169+
170+
class TestTexMathPlugin(TexMathTestMixin, unittest.TestCase):
    """Run the shared cases against the direct markdown -> HTML+MathML output."""

    def _assert_conversion(self, markdown_text: str, expected: str):
        """Render ``markdown_text`` and require an exact match with ``expected``."""
        self.assertEqual(render_markdown(markdown_text), expected)
177+
178+
179+
class TestTexMathPluginRoundtrip(TexMathTestMixin, unittest.TestCase):
    """Run the shared cases through markdown -> HTML+MathML -> parsed objects -> string."""

    # Show the full diff when a comparison fails.
    maxDiff = None

    def _assert_conversion(self, markdown_text: str, expected: str):
        """Render, parse with ``ElementTreeBase.from_string``, serialize and compare.

        Newlines are removed from both sides before comparing, so only the
        markup itself has to survive the roundtrip.
        """
        rendered = render_markdown(markdown_text)

        # from_string may yield either a list of elements or a single element.
        parsed = ElementTreeBase.from_string(rendered)
        if isinstance(parsed, list):
            serialized = "".join(e.to_xml_string().strip() for e in parsed)
        else:
            serialized = parsed.to_xml_string().strip()

        actual_flat = serialized.replace("\n", "").strip()
        expected_flat = expected.replace("\n", "").strip()
        self.assertEqual(actual_flat, expected_flat)
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import re
2+
import xml.etree.ElementTree as ET
3+
4+
from latex2mathml.converter import convert
5+
from markdown_it import MarkdownIt
6+
from markdown_it.renderer import RendererProtocol
7+
from markdown_it.rules_block import StateBlock
8+
from markdown_it.rules_inline import StateInline
9+
from markdown_it.token import Token
10+
from markdown_it.utils import EnvType
11+
from markdown_it.utils import OptionsDict
12+
13+
from contentcuration.utils.assessment.qti.mathml.core import Annotation
14+
from contentcuration.utils.assessment.qti.mathml.core import Semantics
15+
16+
17+
# Patterns for math delimited by a pair of "$$" markers. The captured body uses
# [\s\S] so it may span newlines, and is non-greedy so it stops at the first
# closing "$$". Both patterns are applied with .match() on a slice that starts
# at the opening delimiter.
INLINE_PATTERN = re.compile(r"^\$\$([\s\S]+?)\$\$")
BLOCK_PATTERN = re.compile(r"^\$\$([\s\S]+?)\$\$", flags=re.M)
20+
21+
22+
def math_inline_func(state: StateInline, silent: bool) -> bool:
    """Parse inline math: $$expression$$

    Inline rule for markdown-it. When the text at ``state.pos`` starts with
    ``$$`` and a closing ``$$`` is found, a ``math_inline`` token holding the
    raw expression is pushed (unless ``silent``) and ``state.pos`` is moved
    past the closing delimiter.

    Args:
        state: Inline parser state; ``src``, ``pos`` and ``posMax`` are read
            and ``pos`` is advanced on success.
        silent: When true, only validate and advance; do not emit a token.

    Returns:
        True if a math expression was consumed, False otherwise.
    """
    if not state.src.startswith("$$", state.pos):
        return False

    # Restrict the search to the span this inline pass may consume, so a
    # closing "$$" beyond posMax is never matched.
    # NOTE(review): posMax is presumably smaller than len(src) only during
    # nested tokenization (e.g. link labels) - confirm against markdown-it-py.
    match = INLINE_PATTERN.match(state.src[state.pos : state.posMax])
    if not match:
        return False

    if not silent:
        token = state.push("math_inline", "math", 0)
        token.content = match.group(1)
        token.markup = "$$"

    # match.end() is relative to the slice, i.e. relative to state.pos.
    state.pos += match.end()
    return True
38+
39+
40+
def math_block_func(
    state: StateBlock, begLine: int, endLine: int, silent: bool
) -> bool:
    """Parse block math: $$expression$$

    Block rule for markdown-it. A block is recognised when the first
    non-indent characters of ``begLine`` are ``$$``, a closing ``$$`` exists
    before the end of line ``endLine - 1``, and nothing but whitespace follows
    the closing delimiter on its line. Otherwise the rule declines, so text
    such as ``$$x$$ is unknown.`` is left for the paragraph/inline rules
    instead of having its trailing text discarded.

    Args:
        state: Block parser state; ``src``, ``bMarks``, ``tShift`` and
            ``eMarks`` are read and ``line`` is advanced on success.
        begLine: Index of the line where parsing starts.
        endLine: Index one past the last line this rule may consume.
        silent: When true, do not emit a token.

    Returns:
        True if a math block was consumed, False otherwise.
    """
    begin = state.bMarks[begLine] + state.tShift[begLine]

    if not state.src.startswith("$$", begin):
        return False

    # Never look past the lines handed to this rule: a match ending beyond
    # endLine could not be mapped to a line and state.line would not advance.
    limit = state.eMarks[endLine - 1]
    match = BLOCK_PATTERN.match(state.src[begin:limit])
    if not match:
        return False

    # Locate the line holding the last character of the closing delimiter.
    last = begin + match.end() - 1
    closing_line = begLine
    while closing_line < endLine - 1 and state.eMarks[closing_line] < last:
        closing_line += 1

    # The whole closing line is consumed below, so anything after the "$$"
    # would be dropped silently; decline and let inline parsing handle it.
    if state.src[last + 1 : state.eMarks[closing_line]].strip():
        return False

    if not silent:
        token = state.push("math_block", "math", 0)
        token.block = True
        token.content = match.group(1)
        token.markup = "$$"

    # Advance to next line after the math block
    state.line = closing_line + 1

    return True
69+
70+
71+
def _convert(latex, inline=True):
    """Convert a LaTeX expression to a MathML string with a TeX annotation.

    The children produced by latex2mathml are moved into a ``<semantics>``
    element and followed by an ``<annotation encoding="application/x-tex">``
    carrying the original LaTeX source.

    Args:
        latex: The LaTeX source of the expression, without ``$$`` delimiters.
        inline: Render with ``display="inline"`` when true, ``"block"`` otherwise.

    Returns:
        The serialized ``<math>`` element as a unicode string.
    """
    # Remove the namespace declaration for cleaner output
    markup = convert(latex, display="inline" if inline else "block").replace(
        ' xmlns="http://www.w3.org/1998/Math/MathML"', ""
    )
    # By default latex2mathml encodes operators that don't need to be encoded
    # so we parse it with ElementTree and turn it back into a string here for consistency.
    math_element = ET.fromstring(markup)

    # Create LaTeX annotation
    latex_annotation_element = Annotation(
        encoding="application/x-tex", children=[latex]
    ).to_element()

    semantics_element = Semantics().to_element()
    # Iterate over a snapshot: removing children from an element while
    # iterating over that same element skips every other sibling.
    for child in list(math_element):
        math_element.remove(child)
        semantics_element.append(child)
    semantics_element.append(latex_annotation_element)
    math_element.append(semantics_element)

    return ET.tostring(math_element, encoding="unicode")
93+
94+
95+
def render_math_inline(
    self: RendererProtocol,
    tokens: list[Token],
    idx: int,
    options: OptionsDict,
    env: EnvType,
) -> str:
    """Return inline MathML for the math token at ``tokens[idx]``.

    ``options`` and ``env`` are part of the render-rule signature and are not
    used here.
    """
    math_token = tokens[idx]
    return _convert(math_token.content, inline=True)
104+
105+
106+
def render_math_block(
    self: RendererProtocol,
    tokens: list[Token],
    idx: int,
    options: OptionsDict,
    env: EnvType,
) -> str:
    """Return block-display MathML for the math token at ``tokens[idx]``.

    ``options`` and ``env`` are part of the render-rule signature and are not
    used here.
    """
    math_token = tokens[idx]
    return _convert(math_token.content, inline=False)
115+
116+
117+
def texmath_to_mathml_plugin(md: MarkdownIt) -> None:
    """Simple plugin for parsing TeX math with $$ delimiters.

    Registers the inline and block parsing rules on ``md`` together with the
    render rules that turn the resulting tokens into MathML via latex2mathml.
    """
    # Parsing rules: inline math runs before "escape", block math before "fence".
    inline_ruler = md.inline.ruler
    block_ruler = md.block.ruler
    inline_ruler.before("escape", "math_inline", math_inline_func)
    block_ruler.before("fence", "math_block", math_block_func)

    # Render rules, one per token type emitted by the parsing rules above.
    for token_type, renderer in (
        ("math_inline", render_math_inline),
        ("math_block", render_math_block),
    ):
        md.add_render_rule(token_type, renderer)
129+
130+
131+
# Module-level parser shared by render_markdown: the "gfm-like" preset with the
# "linkify" rule switched off and the $$ math plugin installed.
md = (
    MarkdownIt("gfm-like")
    .disable("linkify")
    .use(texmath_to_mathml_plugin)
)
132+
133+
134+
def render_markdown(markdown):
    """Render ``markdown`` with the module-level parser and return the markup."""
    rendered = md.render(markdown)
    return rendered

requirements.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,5 @@ django-celery-results
3535
packaging>=21.0
3636
langcodes==3.5.0
3737
pydantic==2.11.5
38+
latex2mathml==3.78.0
39+
markdown-it-py==3.0.0

requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,16 @@ langcodes==3.5.0
160160
# via -r requirements.in
161161
language-data==1.3.0
162162
# via langcodes
163+
latex2mathml==3.78.0
164+
# via -r requirements.in
163165
le-utils==0.2.12
164166
# via -r requirements.in
165167
marisa-trie==1.2.1
166168
# via language-data
169+
markdown-it-py==3.0.0
170+
# via -r requirements.in
171+
mdurl==0.1.2
172+
# via markdown-it-py
167173
packaging==25.0
168174
# via
169175
# -r requirements.in

0 commit comments

Comments
 (0)