Unstructured-IO · CrepuscularIRIS · Apr 16, 2026 · Apr 19, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,32 +1,8 @@
-## 0.22.26
-
-### Enhancements
-
-- Add `table_extraction_method` field to `ElementMetadata` to track which algorithm produced a table (grid, tatr, vlm). Propagated from `LayoutElement` during PDF partitioning.
-
-## 0.22.25
-
-### Enhancements
-
-- **`unstructured doctor` CLI**: Add a `unstructured` console script and `python -m unstructured` entry point with a `doctor` subcommand for dependency and capability diagnostics (environment, optional system tools such as libmagic, tesseract, pandoc, ffmpeg, and LibreOffice, and per file-type extras). Supports `doctor --for <type>` (including `image` and `audio` families) and `doctor --file <path>`; exits non-zero when the requested capability is not available.
-
-## 0.22.24
-
-### Fixes
-
-- **Reject oversized hi-res PDF renders before bitmap allocation**: Hi-res PDF partitioning now passes the configured per-page pixel limit to `unstructured-inference` so oversized pages are rejected immediately before rendering, then returned as unprocessable document errors.
-
-## 0.22.23
-
-### Fixes
-
-- **Preserve `colspan`/`rowspan` in first table chunk headers**: `HtmlTable` compactification no longer strips `colspan` and `rowspan` attributes from table cells. Previously, the first `TableChunk` lost merged-cell structural information while continuation chunks retained it (via the source-HTML path used for repeated headers), yielding inconsistent header layout across a split table.
-
 ## 0.22.22
 
-### Security
+### Fixes
 
-- **Replace PyPI opencv wheels with ffmpeg-free builds in Docker image**: After `uv sync`, the Dockerfile now substitutes all PyPI opencv-python variants with a source-built `opencv-contrib-python-headless` wheel compiled with `WITH_FFMPEG=OFF`, eliminating 14 bundled ffmpeg CVEs. The contrib-headless variant is a strict superset of the cv2 API (core + contrib modules, no GUI) so a single wheel replaces `opencv-python`, `opencv-python-headless`, and `opencv-contrib-python`.
+- **Parse large/deeply nested HTML documents (opt-in)**: `partition_html` previously returned an empty element list for HTML with deep subtree nesting because the module-level `etree.HTMLParser` used lxml's default `huge_tree=False`, which silently drops nodes past the default depth limit. Set the `UNSTRUCTURED_HTML_HUGE_TREE` environment variable to `1`/`true`/`yes` (case-insensitive) before `unstructured` is imported to enable `huge_tree=True` and parse deeply nested documents; the value is read once, when the parser module loads. The default remains `False` because `huge_tree=True` disables libxml2's safety guards against malicious inputs (see https://lxml.de/FAQ.html) (#4289).
 
 ## 0.22.21
 

diff --git a/test_unstructured/partition/html/test_partition.py b/test_unstructured/partition/html/test_partition.py
@@ -108,6 +108,72 @@ def test_partition_html_accepts_an_html_str():
     assert len(elements) > 0
 
 
+def test_partition_html_huge_tree_defaults_to_disabled():
+    """`UNSTRUCTURED_HTML_HUGE_TREE` defaults to off so libxml2 safety guards stay on.
+
+    `huge_tree=True` disables protections against malicious inputs (see
+    https://lxml.de/FAQ.html), so it must remain opt-in. The parser is built at import
+    time, so this only holds if the env var was unset when the parser module was imported.
+    """
+    # The libxml2 parser doesn't expose `huge_tree` directly, so assert by behavior:
+    # parsing a deeply-nested document without the env var should silently return [].
+    depth = 260  # just past the ~256-level nesting limit that applies without huge_tree
+    html = (
+        "<html><body>"
+        + "<div>" * depth
+        + "<p>deep</p>"
+        + "</div>" * depth
+        + "</body></html>"
+    )
+
+    elements = partition_html(text=html)
+
+    # With huge_tree disabled, lxml drops nodes past the depth limit.
+    assert elements == []
+
+
+def test_partition_html_parses_deeply_nested_html_when_huge_tree_enabled(monkeypatch):
+    """Regression for #4289: large/deeply-nested HTML must not silently yield zero elements.
+
+    lxml's ``HTMLParser`` defaults to ``huge_tree=False``, which causes subtrees beyond its
+    depth limit (~256) to be dropped silently. Setting ``UNSTRUCTURED_HTML_HUGE_TREE=1`` opts
+    into ``huge_tree=True`` on the module-level parser so ``partition_html`` returns the inner
+    text instead of an empty list. The opt-in is required because ``huge_tree=True`` disables
+    libxml2's safety guards (see https://lxml.de/FAQ.html).
+    """
+    from unstructured.partition.html import parser as html_parser_module
+
+    monkeypatch.setenv("UNSTRUCTURED_HTML_HUGE_TREE", "1")
+    # The env var is only read at module import time, so swap in a huge_tree parser directly.
+    original_parser = html_parser_module.html_parser
+    fresh_parser = etree.HTMLParser(remove_comments=True, huge_tree=True)
+    fresh_parser.set_element_class_lookup(html_parser_module.element_class_lookup)
+    monkeypatch.setattr(html_parser_module, "html_parser", fresh_parser)
+    # `partition.py` imported `html_parser` directly into its namespace, so patch that too.
+    from unstructured.partition.html import partition as partition_module
+
+    monkeypatch.setattr(partition_module, "html_parser", fresh_parser)
+
+    try:
+        depth = 260  # just past the ~256-level nesting limit that applies without huge_tree
+        html = (
+            "<html><body>"
+            + "<div>" * depth
+            + "<p>deeply nested paragraph</p>"
+            + "</div>" * depth
+            + "</body></html>"
+        )
+
+        elements = partition_html(text=html)
+
+        assert len(elements) == 1
+        assert elements[0].text == "deeply nested paragraph"
+    finally:
+        # Restore eagerly; pytest's monkeypatch would also undo these patches at teardown.
+        monkeypatch.setattr(html_parser_module, "html_parser", original_parser)
+        monkeypatch.setattr(partition_module, "html_parser", original_parser)
+
+
 def test_partition_html_accepts_a_url_to_an_HTML_document(requests_get_: Mock):
     requests_get_.return_value = FakeResponse(
         text=example_doc_text("example-10k-1p.html"),

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.26"  # pragma: no cover
+__version__ = "0.22.22"  # pragma: no cover
diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py
@@ -75,6 +75,7 @@
 
 from __future__ import annotations
 
+import os
 import re
 from collections import defaultdict, deque
 from functools import cached_property
@@ -945,7 +946,13 @@ def derive_element_type_from_text(text: str) -> type[Text] | None:
 # ------------------------------------------------------------------------------------------------
 
 
-html_parser = etree.HTMLParser(remove_comments=True)
+# `huge_tree=True` allows lxml to parse deeply nested HTML (>256 levels) but
+# disables libxml2's safety guards against malicious inputs. Default to off; opt in by
+# setting `UNSTRUCTURED_HTML_HUGE_TREE` to 1/true/yes (case-insensitive) before import,
+# since it is read once here. See https://lxml.de/FAQ.html for the security tradeoffs.
+_HUGE_TREE = os.environ.get("UNSTRUCTURED_HTML_HUGE_TREE", "").lower() in ("1", "true", "yes")
+
+html_parser = etree.HTMLParser(remove_comments=True, huge_tree=_HUGE_TREE)
 # -- elements that don't have a registered class get DefaultElement --
 fallback = etree.ElementDefaultClassLookup(element=DefaultElement)
 # -- elements that do have a registered class are assigned that class via lookup --
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.22.26" # pragma: no cover
		__version__ = "0.22.22" # pragma: no cover