Skip to content

Commit 604bba1

Browse files
jigangz and jigangz authored
fix: handle deeply nested HTML that triggers RecursionError (#1644)
* fix: handle deeply nested HTML that triggers RecursionError (#1636)

  Large HTML files with deep DOM nesting (e.g., SEC EDGAR filings) cause markdownify's recursive DOM traversal to exceed Python's default recursion limit (1000). Previously this RecursionError was caught by the top-level _convert() dispatcher, which then fell through to PlainTextConverter — silently returning the raw HTML as 'markdown' with no warning.

  This fix catches RecursionError in HtmlConverter.convert() and falls back to BeautifulSoup's iterative get_text() method, which handles arbitrary nesting depths. A warning is emitted so callers know the output is plain text rather than full markdown.

  Root cause chain:
  1. HtmlConverter.convert() calls markdownify.convert_soup() (recursive)
  2. Deeply nested HTML (>~400 levels) triggers RecursionError
  3. _convert() catches all Exceptions, stores in failed_attempts
  4. PlainTextConverter.accepts() matches text/html via 'text/' prefix
  5. PlainTextConverter.convert() returns raw HTML bytes as text
  6. Caller receives 'markdown' that is actually unconverted HTML

* refactor: address review feedback on RecursionError fallback

  - Move 'import warnings' to module top level (was inside except block)
  - Make test environment-independent by temporarily lowering sys.setrecursionlimit(200) instead of relying on depth=500 being sufficient on all platforms; original limit restored in finally block
  - Add strict=True keyword argument to opt out of the plain-text fallback and let RecursionError propagate to the caller

* test: use result.markdown instead of deprecated result.text_content

---------

Co-authored-by: jigangz <jigangz@github.com>
1 parent 63cbbd9 commit 604bba1

2 files changed

Lines changed: 74 additions & 4 deletions

File tree

packages/markitdown/src/markitdown/converters/_html_converter.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import io
2+
import warnings
23
from typing import Any, BinaryIO, Optional
34
from bs4 import BeautifulSoup
45

@@ -44,6 +45,10 @@ def convert(
4445
stream_info: StreamInfo,
4546
**kwargs: Any, # Options to pass to the converter
4647
) -> DocumentConverterResult:
48+
# Pop our own keyword before forwarding the rest to markdownify.
49+
# strict=True raises RecursionError instead of falling back to plain text.
50+
strict: bool = kwargs.pop("strict", False)
51+
4752
# Parse the stream
4853
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
4954
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
@@ -55,10 +60,25 @@ def convert(
5560
# Print only the main content
5661
body_elm = soup.find("body")
5762
webpage_text = ""
58-
if body_elm:
59-
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
60-
else:
61-
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
63+
try:
64+
if body_elm:
65+
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
66+
else:
67+
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
68+
except RecursionError:
69+
if strict:
70+
raise
71+
# Large or deeply-nested HTML can exceed Python's recursion limit
72+
# during markdownify's recursive DOM traversal. Fall back to
73+
# BeautifulSoup's iterative get_text() so the caller still gets
74+
# usable plain-text content instead of raw HTML.
75+
warnings.warn(
76+
"HTML document is too deeply nested for markdown conversion "
77+
"(RecursionError). Falling back to plain-text extraction.",
78+
stacklevel=2,
79+
)
80+
target = body_elm if body_elm else soup
81+
webpage_text = target.get_text("\n", strip=True)
6282

6383
assert isinstance(webpage_text, str)
6484

packages/markitdown/tests/test_module_misc.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,56 @@ def test_input_as_strings() -> None:
288288
assert "# Test" in result.text_content
289289

290290

291+
def test_deeply_nested_html_fallback() -> None:
    """Deeply nested HTML must degrade to plain text, not to raw HTML.

    Regression test for issue #1636. The interpreter's recursion limit is
    lowered for the duration of the conversion so that a RecursionError is
    provoked no matter what the host's default limit is; the previous limit
    is always restored afterwards.
    """
    import sys
    import warnings

    nesting = 500  # number of nested <div> wrappers around the payload
    reduced_limit = 200  # well below the traversal depth needed for `nesting`

    # Same document the loop-based construction would yield: `nesting`
    # opening tags, one paragraph, `nesting` closing tags.
    document = "".join(
        (
            "<html><body>",
            '<div style="margin-left:10px">' * nesting,
            "<p>Deep content with <b>bold text</b></p>",
            "</div>" * nesting,
            "</body></html>",
        )
    )

    converter = MarkItDown()
    saved_limit = sys.getrecursionlimit()
    try:
        sys.setrecursionlimit(reduced_limit)
        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, including ones already shown once.
            warnings.simplefilter("always")
            result = converter.convert_stream(
                io.BytesIO(document.encode("utf-8")),
                file_extension=".html",
            )

        # The fallback path is expected to announce itself with a warning.
        assert any("deeply nested" in str(entry.message) for entry in caught)
    finally:
        # Never leak the lowered limit into other tests.
        sys.setrecursionlimit(saved_limit)

    # Text content survives the fallback ...
    for expected in ("Deep content", "bold text"):
        assert expected in result.markdown
    # ... while the markup itself does not.
    for forbidden in ("<div", "<p>"):
        assert forbidden not in result.markdown
340+
291341
def test_doc_rlink() -> None:
292342
# Test for: CVE-2025-11849
293343
markitdown = MarkItDown()

0 commit comments

Comments
 (0)