Add utilities for parsing markdown to HTML/MathML.

rtibbles · rtibbles · commit 9e25d56085c8 · 2025-06-30T07:42:21.000-07:00
diff --git a/contentcuration/contentcuration/tests/utils/test_markdown.py b/contentcuration/contentcuration/tests/utils/test_markdown.py
@@ -0,0 +1,200 @@
+import unittest
+
+from contentcuration.utils.assessment.markdown import render_markdown
+from contentcuration.utils.assessment.qti import ElementTreeBase
+
+
+class TexMathTestMixin:
+    """Mixin providing test methods for TexMath plugin tests"""
+
+    maxDiff = None
+
+    def _assert_conversion(self, markdown_text: str, expected: str):
+        """Override in subclasses to define assertion behavior"""
+        raise NotImplementedError("Subclasses must implement _assert_conversion")
+
+    def test_markdown_with_inline_math(self):
+        """Test conversion of markdown with inline math to HTML + MathML"""
+
+        markdown_text = (
+            "What is the answer to this *question*? $$x\cdot y=z^2$$"  # noqa W605
+        )
+        expected = (
+            "<p>What is the answer to this <em>question</em>? "
+            '<math display="inline">'
+            "<semantics><mrow><mi>x</mi><mi>·</mi><mi>y</mi><mo>=</mo><msup><mi>z</mi><mn>2</mn></msup></mrow>"
+            '<annotation encoding="application/x-tex">x\cdot y=z^2</annotation></semantics>'  # noqa W605
+            "</math></p>\n"
+        )
+
+        self._assert_conversion(markdown_text, expected)
+
+    def test_block_math(self):
+        """Test conversion of block math"""
+
+        markdown_text = (
+            "Here's an equation:\n\n$$E = mc^2$$\n\nThat's Einstein's formula."
+        )
+        expected = (
+            "<p>Here's an equation:</p>\n"
+            '<math display="block">'
+            "<semantics><mrow><mi>E</mi><mo>=</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow>"
+            '<annotation encoding="application/x-tex">E = mc^2</annotation></semantics>'
+            "</math>"
+            "<p>That's Einstein's formula.</p>\n"
+        )
+
+        self._assert_conversion(markdown_text, expected)
+
+    def test_multiline_block_math(self):
+        """
+        Ensure a $$ … $$ block spanning multiple lines is converted to MathML
+        and the literal $$ delimiters are removed. This currently fails with
+        the buggy BLOCK_PATTERN because it stops after the first '$'.
+        """
+        markdown_text = (
+            "$$\n"
+            "\\begin{aligned}\n"
+            "a = b + c \\\\\n"
+            "$5 = d + e\n"
+            "\\end{aligned}\n"
+            "$$"
+        )
+        expected = (
+            '<math display="block">'
+            "<semantics><mrow><mrow><mi>a</mi><mo>=</mo><mi>b</mi><mo>+</mo><mi>c</mi>"
+            '<mspace linebreak="newline" /><mi>$</mi><mn>5</mn><mo>=</mo><mi>d</mi><mo>+</mo><mi>e</mi></mrow></mrow>'
+            '<annotation encoding="application/x-tex">\n\\begin{aligned}\na = b + c \\\\\n$5 = d + e\n\\end{aligned}\n</annotation></semantics>'
+            "</math>"
+        )
+
+        self._assert_conversion(markdown_text, expected)
+
+    def test_inline_math_with_dollar_inside(self):
+        """
+        Ensure a $$ … $$ inline that contains an internal '$' (e.g. inside
+        \\text{}) is parsed correctly. With the old BLOCK_PATTERN the first '$'
+        prematurely terminates the match so the delimiters remain.
+        """
+        markdown_text = "Test this $$\\text{Cost = 1.00 $USD$}$$"
+        expected = (
+            "<p>Test this "
+            '<math display="inline">'
+            "<semantics><mrow><mtext>Cost = 1.00 $USD$</mtext></mrow>"
+            '<annotation encoding="application/x-tex">\\text{Cost = 1.00 $USD$}</annotation></semantics>'
+            "</math></p>\n"
+        )
+
+        self._assert_conversion(markdown_text, expected)
+
+    def test_multiple_math_expressions(self):
+        """Test multiple math expressions in one document"""
+
+        markdown_text = "First: $$a + b$$, then $$c \\times d$$, finally $$e^f$$."
+        expected = (
+            "<p>First: "
+            '<math display="inline"><semantics><mrow><mi>a</mi><mo>+</mo><mi>b</mi></mrow>'
+            '<annotation encoding="application/x-tex">a + b</annotation></semantics></math>'
+            ", then "
+            '<math display="inline"><semantics><mrow><mi>c</mi><mi>×</mi><mi>d</mi></mrow>'
+            '<annotation encoding="application/x-tex">c \\times d</annotation></semantics></math>'
+            ", finally "
+            '<math display="inline"><semantics><mrow><msup><mi>e</mi><mi>f</mi></msup></mrow>'
+            '<annotation encoding="application/x-tex">e^f</annotation></semantics></math>'
+            ".</p>\n"
+        )
+
+        self._assert_conversion(markdown_text, expected)
+
+    def test_mixed_inline_and_block(self):
+        """Test document with both inline and block math"""
+
+        markdown_text = (
+            "This is inline math: $$a = b$$\n\n"
+            "And this is block math:\n\n"
+            "$$\\sum_{i=1}^{n} x_i = y$$\n\n"
+            "Back to text with more inline: $$z^2$$"
+        )
+        expected = (
+            "<p>This is inline math: "
+            '<math display="inline"><semantics><mrow><mi>a</mi><mo>=</mo><mi>b</mi></mrow>'
+            '<annotation encoding="application/x-tex">a = b</annotation></semantics></math>'
+            "</p>\n"
+            "<p>And this is block math:</p>\n"
+            '<math display="block">'
+            "<semantics><mrow><msubsup><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mrow>"
+            "<mi>n</mi></mrow></msubsup><msub><mi>x</mi><mi>i</mi></msub><mo>=</mo><mi>y</mi></mrow>"
+            '<annotation encoding="application/x-tex">\sum_{i=1}^{n} x_i = y</annotation></semantics>'  # noqa W605
+            "</math>"
+            "<p>Back to text with more inline: "
+            '<math display="inline"><semantics><mrow><msup><mi>z</mi><mn>2</mn></msup></mrow>'
+            '<annotation encoding="application/x-tex">z^2</annotation></semantics></math>'
+            "</p>\n"
+        )
+
+        self._assert_conversion(markdown_text, expected)
+
+    def test_no_math_content(self):
+        """Test that regular markdown without math still works"""
+
+        markdown_text = "This is just *regular* markdown with **bold** text."
+        expected = "<p>This is just <em>regular</em> markdown with <strong>bold</strong> text.</p>\n"
+
+        self._assert_conversion(markdown_text, expected)
+
+    def test_simple_inline_math(self):
+        """Test simple inline math expression"""
+
+        markdown_text = "The variable $$x$$ is unknown."
+        expected = (
+            "<p>The variable "
+            '<math display="inline"><semantics><mrow><mi>x</mi></mrow>'
+            '<annotation encoding="application/x-tex">x</annotation></semantics></math>'
+            " is unknown.</p>\n"
+        )
+
+        self._assert_conversion(markdown_text, expected)
+
+    def test_simple_block_math(self):
+        """Test simple block math expression"""
+
+        markdown_text = "$$y = mx + b$$"
+        expected = (
+            '<math display="block">'
+            "<semantics><mrow><mi>y</mi><mo>=</mo><mi>m</mi><mi>x</mi><mo>+</mo><mi>b</mi></mrow>"
+            '<annotation encoding="application/x-tex">y = mx + b</annotation></semantics>'
+            "</math>"
+        )
+
+        self._assert_conversion(markdown_text, expected)
+
+
+class TestTexMathPlugin(TexMathTestMixin, unittest.TestCase):
+    """Test direct markdown conversion: markdown → HTML+MathML"""
+
+    def _assert_conversion(self, markdown_text: str, expected: str):
+        """Test direct markdown to HTML+MathML conversion"""
+        result = render_markdown(markdown_text)
+        self.assertEqual(result, expected)
+
+
+class TestTexMathPluginRoundtrip(TexMathTestMixin, unittest.TestCase):
+    """Test full roundtrip: markdown → HTML+MathML → Pydantic → string"""
+
+    maxDiff = None
+
+    def _assert_conversion(self, markdown_text: str, expected: str):
+        """Test full roundtrip conversion via Pydantic objects"""
+        result = render_markdown(markdown_text)
+
+        # Parse to Pydantic objects and back to string
+        parsed = ElementTreeBase.from_string(result)
+        roundtrip_result = (
+            "".join(e.to_xml_string().strip() for e in parsed)
+            if isinstance(parsed, list)
+            else parsed.to_xml_string().strip()
+        )
+        self.assertEqual(
+            roundtrip_result.replace("\n", "").strip(),
+            expected.replace("\n", "").strip(),
+        )
diff --git a/contentcuration/contentcuration/utils/assessment/markdown.py b/contentcuration/contentcuration/utils/assessment/markdown.py
@@ -0,0 +1,135 @@
+import re
+import xml.etree.ElementTree as ET
+
+from latex2mathml.converter import convert
+from markdown_it import MarkdownIt
+from markdown_it.renderer import RendererProtocol
+from markdown_it.rules_block import StateBlock
+from markdown_it.rules_inline import StateInline
+from markdown_it.token import Token
+from markdown_it.utils import EnvType
+from markdown_it.utils import OptionsDict
+
+from contentcuration.utils.assessment.qti.mathml.core import Annotation
+from contentcuration.utils.assessment.qti.mathml.core import Semantics
+
+
+# Regex patterns for $$ delimited math.
+# Each pattern is anchored at the start of the text it is matched against,
+# lazily captures at least one character (newlines included, via [\s\S]) and
+# stops at the first closing $$, so single '$' characters may appear inside.
+INLINE_PATTERN = re.compile(r"^\$\$([\s\S]+?)\$\$")
+# NOTE(review): this pattern is applied with .match() on a slice that already
+# starts at the opening $$, so re.M does not appear to change what it
+# matches -- confirm before relying on the flag.
+BLOCK_PATTERN = re.compile(r"^\$\$([\s\S]+?)\$\$", re.M)
+
+
+def math_inline_func(state: StateInline, silent: bool) -> bool:
+    """Parse inline math: $$expression$$"""
+    if not state.src.startswith("$$", state.pos):
+        return False
+
+    match = INLINE_PATTERN.match(state.src[state.pos :])
+    if not match:
+        return False
+
+    if not silent:
+        token = state.push("math_inline", "math", 0)
+        token.content = match.group(1)
+        token.markup = "$$"
+
+    state.pos += match.end()
+    return True
+
+
+def math_block_func(
+    state: StateBlock, begLine: int, endLine: int, silent: bool
+) -> bool:
+    """Parse block math: $$expression$$"""
+    begin = state.bMarks[begLine] + state.tShift[begLine]
+
+    if not state.src.startswith("$$", begin):
+        return False
+
+    match = BLOCK_PATTERN.match(state.src[begin:])
+    if not match:
+        return False
+
+    if not silent:
+        token = state.push("math_block", "math", 0)
+        token.block = True
+        token.content = match.group(1)
+        token.markup = "$$"
+
+    # Advance to next line after the math block
+    endpos = begin + match.end() - 1
+    line = begLine
+    while line < endLine:
+        if endpos >= state.bMarks[line] and endpos <= state.eMarks[line]:
+            state.line = line + 1
+            break
+        line += 1
+
+    return True
+
+
+def _convert(latex, inline=True):
+    # Remove the namespace declaration for cleaner output
+    markup = convert(latex, display="inline" if inline else "block").replace(
+        ' xmlns="http://www.w3.org/1998/Math/MathML"', ""
+    )
+    # By default latex2mathml encodes operators that don't need to be encoded
+    # so we parse it with ElementTree and turn it back into a string here for consistency.
+    math_element = ET.fromstring(markup)
+
+    # Create LaTeX annotation
+    latex_annotation_element = Annotation(
+        encoding="application/x-tex", children=[latex]
+    ).to_element()
+
+    semantics_element = Semantics().to_element()
+    for child in math_element:
+        math_element.remove(child)
+        semantics_element.append(child)
+    semantics_element.append(latex_annotation_element)
+    math_element.append(semantics_element)
+
+    return ET.tostring(math_element, encoding="unicode")
+
+
+def render_math_inline(
+    self: RendererProtocol,
+    tokens: list[Token],
+    idx: int,
+    options: OptionsDict,
+    env: EnvType,
+) -> str:
+    """Render inline math to MathML"""
+    return _convert(tokens[idx].content)
+
+
+def render_math_block(
+    self: RendererProtocol,
+    tokens: list[Token],
+    idx: int,
+    options: OptionsDict,
+    env: EnvType,
+) -> str:
+    """Render block math to MathML"""
+    return _convert(tokens[idx].content, inline=False)
+
+
+def texmath_to_mathml_plugin(md: MarkdownIt) -> None:
+    """Simple plugin for parsing TeX math with $$ delimiters.
+
+    Converts inline and block math expressions to MathML using latex2mathml.
+    """
+    # Register parsing rules
+    md.inline.ruler.before("escape", "math_inline", math_inline_func)
+    md.block.ruler.before("fence", "math_block", math_block_func)
+
+    # Register renderers
+    md.add_render_rule("math_inline", render_math_inline)
+    md.add_render_rule("math_block", render_math_block)
+
+
+# Shared parser instance, built once at import time: the "gfm-like" preset
+# with the "linkify" rule disabled and the TeX math plugin installed.
+# NOTE(review): linkify is presumably disabled to avoid the optional
+# linkify-it-py dependency -- confirm.
+md = MarkdownIt("gfm-like").disable("linkify").use(texmath_to_mathml_plugin)
+
+
+def render_markdown(markdown):
+    return md.render(markdown)
diff --git a/requirements.in b/requirements.in
@@ -35,3 +35,5 @@ django-celery-results
 packaging>=21.0
 langcodes==3.5.0
 pydantic==2.11.5
+latex2mathml==3.78.0
+markdown-it-py==3.0.0
diff --git a/requirements.txt b/requirements.txt
@@ -160,10 +160,16 @@ langcodes==3.5.0
     # via -r requirements.in
 language-data==1.3.0
     # via langcodes
+latex2mathml==3.78.0
+    # via -r requirements.in
 le-utils==0.2.12
     # via -r requirements.in
 marisa-trie==1.2.1
     # via language-data
+markdown-it-py==3.0.0
+    # via -r requirements.in
+mdurl==0.1.2
+    # via markdown-it-py
 packaging==25.0
     # via
     #   -r requirements.in