diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py index 5f4d09d401..61bf1321d6 100644 --- a/unstructured/partition/html/parser.py +++ b/unstructured/partition/html/parser.py @@ -926,7 +926,7 @@ def derive_element_type_from_text(text: str) -> type[Text] | None: # ------------------------------------------------------------------------------------------------ -html_parser = etree.HTMLParser(remove_comments=True) +html_parser = etree.HTMLParser(remove_comments=True, huge_tree=True) # -- elements that don't have a registered class get DefaultElement -- fallback = etree.ElementDefaultClassLookup(element=DefaultElement) # -- elements that do have a registered class are assigned that class via lookup --