Skip to content

Commit b923bce

Browse files
authored
fix(crawler): switch to CommonMark-compliant markdown parser (#751)
1 parent 5ee5bef commit b923bce

6 files changed

Lines changed: 169 additions & 150 deletions

File tree

services/crawler/app/services/base_converter.py

Lines changed: 5 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -10,44 +10,11 @@
1010

1111
import asyncio
1212
import logging
13-
import re
1413

1514
from playwright.async_api import Browser, Page, async_playwright
1615

1716
logger = logging.getLogger(__name__)
1817

19-
_ATX_HEADING_RE = re.compile(r"^#{1,6}\s", re.MULTILINE)
20-
21-
22-
def _normalize_markdown_headings(text: str) -> str:
23-
"""Ensure blank lines before ATX headings for reliable parsing.
24-
25-
Skips content inside fenced code blocks (``` or ~~~).
26-
"""
27-
lines = text.split("\n")
28-
result: list[str] = []
29-
in_fence = False
30-
fence_marker = ""
31-
32-
for line in lines:
33-
stripped = line.strip()
34-
35-
if not in_fence:
36-
if stripped.startswith("```") or stripped.startswith("~~~"):
37-
in_fence = True
38-
fence_marker = stripped[:3]
39-
elif stripped == fence_marker or (stripped.startswith(fence_marker) and stripped.rstrip("`~") == ""):
40-
in_fence = False
41-
result.append(line)
42-
continue
43-
44-
if not in_fence and _ATX_HEADING_RE.match(line) and result and result[-1].strip():
45-
result.append("")
46-
47-
result.append(line)
48-
49-
return "\n".join(result)
50-
5118

5219
# Default HTML template for rendering content
5320
# Uses Noto fonts for multi-language support:
@@ -181,18 +148,8 @@ def _wrap_html(self, content: str, extra_head: str = "") -> str:
181148
return DEFAULT_HTML_TEMPLATE.format(content=content, extra_head=extra_head)
182149

183150
async def markdown_to_html(self, markdown: str) -> str:
184-
"""Convert markdown to HTML using Python-Markdown."""
185-
import markdown as md
186-
187-
normalized = _normalize_markdown_headings(markdown)
188-
189-
html = md.markdown(
190-
normalized,
191-
extensions=[
192-
"tables",
193-
"fenced_code",
194-
"codehilite",
195-
"toc",
196-
],
197-
)
198-
return html
151+
"""Convert markdown to HTML using markdown-it-py (CommonMark-compliant)."""
152+
from markdown_it import MarkdownIt
153+
154+
md = MarkdownIt("commonmark").enable("table")
155+
return md.render(markdown)

services/crawler/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ dependencies = [
1313
"python-dotenv==1.2.2",
1414
"loguru==0.7.3",
1515
"httpx==0.28.1",
16-
"markdown==3.10.2",
16+
"markdown-it-py>=3.0.0",
1717
"python-pptx==1.0.2",
1818
"python-docx==1.2.0",
1919
"pymupdf==1.27.2",

services/crawler/tests/test_markdown_normalize.py

Lines changed: 0 additions & 88 deletions
This file was deleted.
(New test file for markdown_to_html follows; its path was not captured in this page extract.)

Lines changed: 158 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,158 @@
1+
"""Tests for markdown_to_html in base_converter.
2+
3+
Verifies CommonMark-compliant parsing via markdown-it-py, including
4+
leading-space headings, table interactions, and fenced code blocks.
5+
"""
6+
7+
import pytest
8+
9+
from app.services.base_converter import BaseConverterService
10+
11+
12+
@pytest.fixture
13+
def converter():
14+
return BaseConverterService()
15+
16+
17+
class TestHeadingParsing:
18+
"""ATX headings with various leading-space patterns."""
19+
20+
@pytest.mark.asyncio
21+
async def test_heading_no_leading_space(self, converter):
22+
html = await converter.markdown_to_html("# Heading")
23+
assert "<h1>" in html
24+
25+
@pytest.mark.asyncio
26+
@pytest.mark.parametrize("spaces", [1, 2, 3])
27+
async def test_heading_with_leading_spaces(self, converter, spaces):
28+
md = " " * spaces + "## Heading"
29+
html = await converter.markdown_to_html(md)
30+
assert "<h2>" in html
31+
32+
@pytest.mark.asyncio
33+
async def test_four_spaces_is_code_block(self, converter):
34+
html = await converter.markdown_to_html(" # Not a heading")
35+
assert "<h1>" not in html
36+
assert "<code>" in html
37+
38+
@pytest.mark.asyncio
39+
async def test_all_heading_levels(self, converter):
40+
md = "\n\n".join(f"{'#' * i} Level {i}" for i in range(1, 7))
41+
html = await converter.markdown_to_html(md)
42+
for i in range(1, 7):
43+
assert f"<h{i}>" in html
44+
45+
46+
class TestHeadingAfterTable:
47+
"""Headings following tables — the original bug that motivated normalization."""
48+
49+
@pytest.mark.asyncio
50+
async def test_heading_after_table_no_blank_line(self, converter):
51+
md = "| a | b |\n|---|---|\n| c | d |\n### Heading"
52+
html = await converter.markdown_to_html(md)
53+
assert "<h3>" in html
54+
assert "<table>" in html
55+
56+
@pytest.mark.asyncio
57+
async def test_heading_after_table_with_blank_line(self, converter):
58+
md = "| a | b |\n|---|---|\n| c | d |\n\n### Heading"
59+
html = await converter.markdown_to_html(md)
60+
assert "<h3>" in html
61+
assert "<table>" in html
62+
63+
@pytest.mark.asyncio
64+
async def test_multiple_tables_with_headings(self, converter):
65+
md = "### A\n\n| h1 | h2 |\n|---|---|\n| c1 | c2 |\n### B\n\n| h3 | h4 |\n|---|---|\n| c3 | c4 |"
66+
html = await converter.markdown_to_html(md)
67+
assert html.count("</h3>") == 2
68+
assert html.count("</table>") == 2
69+
70+
71+
class TestFencedCodeBlocks:
72+
"""Hash characters inside code fences must not become headings."""
73+
74+
@pytest.mark.asyncio
75+
async def test_hash_in_backtick_fence(self, converter):
76+
md = "```python\n# comment\ndef foo():\n pass\n```"
77+
html = await converter.markdown_to_html(md)
78+
assert "<h1>" not in html
79+
assert "# comment" in html
80+
81+
@pytest.mark.asyncio
82+
async def test_hash_in_tilde_fence(self, converter):
83+
md = "~~~\n# not a heading\n~~~"
84+
html = await converter.markdown_to_html(md)
85+
assert "<h1>" not in html
86+
87+
@pytest.mark.asyncio
88+
async def test_heading_after_code_block(self, converter):
89+
md = "```\ncode\n```\n## Heading"
90+
html = await converter.markdown_to_html(md)
91+
assert "<h2>" in html
92+
93+
94+
class TestInlineFormatting:
95+
"""Bold, italic, and other inline syntax."""
96+
97+
@pytest.mark.asyncio
98+
async def test_bold_text(self, converter):
99+
html = await converter.markdown_to_html("**bold text**")
100+
assert "<strong>" in html
101+
102+
@pytest.mark.asyncio
103+
async def test_italic_text(self, converter):
104+
html = await converter.markdown_to_html("*italic*")
105+
assert "<em>" in html
106+
107+
108+
class TestRealisticLLMOutput:
109+
"""End-to-end test with actual LLM output patterns."""
110+
111+
@pytest.mark.asyncio
112+
async def test_contract_comparison_report(self, converter):
113+
md = """ # Bericht zum Vertragsvergleich
114+
115+
**Dokumentenversionen:** file1.docx → file2.docx → file3.docx
116+
117+
**Anzahl analysierter Versionstransitionen:** 2
118+
119+
---
120+
121+
# Zentrale Erkenntnisse
122+
123+
### Verhandlungsverlauf
124+
125+
Der Verhandlungsprozess durchläuft einen fundamentalen Strukturwandel.
126+
127+
### Risikoverlagerungen
128+
129+
Die Risikoallokation erfährt eine bemerkenswerte Pendelbewegung.
130+
131+
### Methodik
132+
133+
Deterministischer Textvergleich auf Absatzebene."""
134+
135+
html = await converter.markdown_to_html(md)
136+
assert html.count("<h1>") == 2
137+
assert html.count("<h3>") == 3
138+
assert "<strong>" in html
139+
assert "<hr" in html
140+
141+
@pytest.mark.asyncio
142+
async def test_frequency_table_section(self, converter):
143+
md = """ ### Am häufigsten verhandelte Klauseln
144+
145+
| Clause Family | Substantive | Editorial | Total |
146+
|---|---|---|---|
147+
| Allgemeine Bestimmungen | 6 | 3 | 11 |
148+
| Schadloshaltungen | 8 | 0 | 8 |
149+
150+
---
151+
152+
# Detaillierte Evolutionsanalyse"""
153+
154+
html = await converter.markdown_to_html(md)
155+
assert "<h3>" in html
156+
assert "<table>" in html
157+
assert "<h1>" in html
158+
assert "<hr" in html

services/crawler/uv.lock

Lines changed: 2 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

services/platform/convex/workflow_engine/helpers/nodes/llm/execute_agent_with_tools.ts

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -363,8 +363,9 @@ async function executeTextOutput(
363363
);
364364

365365
const { agentSteps, toolDiagnostics } = processAgentResult(result);
366-
const outputText =
367-
isRecord(result) && typeof result.text === 'string' ? result.text : '';
366+
const outputText = (
367+
isRecord(result) && typeof result.text === 'string' ? result.text : ''
368+
).trim();
368369

369370
if (!outputText || !outputText.trim()) {
370371
const stepsCount = Array.isArray(agentSteps) ? agentSteps.length : 0;

0 commit comments

Comments
 (0)