Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 0.22.22

### Fixes

- **Parse large/deeply nested HTML documents (opt-in)**: `partition_html` previously returned an empty element list for HTML with deep subtree nesting because the module-level `etree.HTMLParser` used lxml's default `huge_tree=False`, which silently drops nodes past the default depth limit. Set the `UNSTRUCTURED_HTML_HUGE_TREE` environment variable to `1`/`true`/`yes` (case-insensitive) to enable `huge_tree=True` and parse deeply nested documents. The variable is read once, when the HTML parser module is imported, so it must be set before `unstructured` is imported. The default remains `False` because `huge_tree=True` disables libxml2's safety guards against malicious inputs (see https://lxml.de/FAQ.html) (#4289).

## 0.22.21

### Enhancements
Expand Down
66 changes: 66 additions & 0 deletions test_unstructured/partition/html/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,72 @@ def test_partition_html_accepts_an_html_str():
assert len(elements) > 0


def test_partition_html_huge_tree_defaults_to_disabled():
    """With `UNSTRUCTURED_HTML_HUGE_TREE` unset, deeply nested HTML yields no elements.

    `huge_tree=True` disables libxml2's protections against malicious inputs (see
    https://lxml.de/FAQ.html), so it has to stay opt-in. The parser does not expose its
    `huge_tree` setting, so the default is verified by behavior instead: a document nested
    past the depth limit is expected to come back as an empty element list.
    """
    nesting_levels = 260
    opening_tags = "<div>" * nesting_levels
    closing_tags = "</div>" * nesting_levels
    document = f"<html><body>{opening_tags}<p>deep</p>{closing_tags}</body></html>"

    # -- with huge_tree disabled, lxml drops the nodes beyond the depth limit --
    assert partition_html(text=document) == []


def test_partition_html_parses_deeply_nested_html_when_huge_tree_enabled(monkeypatch):
    """Regression for #4289: large/deeply-nested HTML must not silently yield zero elements.

    lxml's ``HTMLParser`` defaults to ``huge_tree=False``, which causes subtrees beyond its
    depth limit (~256) to be dropped silently. Setting ``UNSTRUCTURED_HTML_HUGE_TREE=1`` opts
    into ``huge_tree=True`` on the module-level parser so ``partition_html`` returns the inner
    text instead of an empty list. The opt-in is required because ``huge_tree=True`` disables
    libxml2's safety guards (see https://lxml.de/FAQ.html).
    """
    from unstructured.partition.html import parser as html_parser_module
    from unstructured.partition.html import partition as partition_module

    monkeypatch.setenv("UNSTRUCTURED_HTML_HUGE_TREE", "1")
    # The parser is built at module import time, so setting the env var here does not change
    # the already-imported parser; build an equivalent `huge_tree=True` parser and swap it in.
    fresh_parser = etree.HTMLParser(remove_comments=True, huge_tree=True)
    fresh_parser.set_element_class_lookup(html_parser_module.element_class_lookup)
    # `partition.py` imported `html_parser` directly into its namespace, so both references
    # must be patched. `monkeypatch` restores the originals at teardown, which keeps later
    # tests in this process on the default parser without any manual cleanup.
    monkeypatch.setattr(html_parser_module, "html_parser", fresh_parser)
    monkeypatch.setattr(partition_module, "html_parser", fresh_parser)

    depth = 260
    html = (
        "<html><body>"
        + "<div>" * depth
        + "<p>deeply nested paragraph</p>"
        + "</div>" * depth
        + "</body></html>"
    )

    elements = partition_html(text=html)

    assert len(elements) == 1
    assert elements[0].text == "deeply nested paragraph"


def test_partition_html_accepts_a_url_to_an_HTML_document(requests_get_: Mock):
requests_get_.return_value = FakeResponse(
text=example_doc_text("example-10k-1p.html"),
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.21" # pragma: no cover
__version__ = "0.22.22" # pragma: no cover
9 changes: 8 additions & 1 deletion unstructured/partition/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@

from __future__ import annotations

import os
import re
from collections import defaultdict, deque
from functools import cached_property
Expand Down Expand Up @@ -945,7 +946,13 @@ def derive_element_type_from_text(text: str) -> type[Text] | None:
# ------------------------------------------------------------------------------------------------


html_parser = etree.HTMLParser(remove_comments=True)
# `huge_tree=True` allows lxml to parse deeply nested HTML (>256 levels) but
# disables libxml2's safety guards against malicious inputs. Default to off and
# require explicit opt-in via the `UNSTRUCTURED_HTML_HUGE_TREE` env var.
# See https://lxml.de/FAQ.html for the security tradeoffs.
# The value is matched case-insensitively and with surrounding whitespace
# stripped, so values such as "1 " or " TRUE" (easy to produce from shell
# exports and .env files) still opt in instead of being silently ignored.
_HUGE_TREE_TRUTHY_VALUES = ("1", "true", "yes")
_HUGE_TREE = (
    os.environ.get("UNSTRUCTURED_HTML_HUGE_TREE", "").strip().lower() in _HUGE_TREE_TRUTHY_VALUES
)

# -- built once at import time, so `huge_tree` is fixed by the value `_HUGE_TREE` had then --
html_parser = etree.HTMLParser(remove_comments=True, huge_tree=_HUGE_TREE)
# -- elements that don't have a registered class get DefaultElement --
fallback = etree.ElementDefaultClassLookup(element=DefaultElement)
# -- elements that do have a registered class are assigned that class via lookup --
Expand Down