From 6c22befed402016338f909ef4e1431328ea7a943 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 4 Jun 2025 16:11:28 +0200 Subject: [PATCH 1/9] Remove IDs from HTML code --- unstructured/documents/mappings.py | 16 +++---- unstructured/documents/ontology.py | 43 ++++++++++--------- .../partition/html/transformations.py | 22 +++++----- 3 files changed, 43 insertions(+), 38 deletions(-) diff --git a/unstructured/documents/mappings.py b/unstructured/documents/mappings.py index a006409027..ce05364f68 100644 --- a/unstructured/documents/mappings.py +++ b/unstructured/documents/mappings.py @@ -5,13 +5,13 @@ of parsed documents """ -from typing import Any, Dict, Type +from typing import Dict, Type from unstructured.documents import elements, ontology from unstructured.documents.elements import Element -def get_all_subclasses(cls) -> list[Any]: +def get_all_subclasses(cls: type) -> list[type]: """ Recursively find all subclasses of a given class. @@ -19,7 +19,7 @@ def get_all_subclasses(cls) -> list[Any]: cls (type): The class for which to find all subclasses. Returns: - list: A list of all subclasses of the given class. + list[type]: A list of all subclasses of the given class. """ subclasses = cls.__subclasses__() all_subclasses = subclasses.copy() @@ -30,7 +30,9 @@ def get_all_subclasses(cls) -> list[Any]: return all_subclasses -def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]: +def get_ontology_to_unstructured_type_mapping() -> ( + dict[Type[ontology.OntologyElement], Type[Element]] +): """ Get a mapping of ontology element to unstructured type. @@ -50,7 +52,7 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]: dict: A dictionary where keys are ontology element classes and values are unstructured types. """ - ontology_to_unstructured_class_mapping = { + ontology_to_unstructured_class_mapping: Dict[Type[ontology.OntologyElement], Type[Element]] = { ontology.Document: elements.Text, ontology.Section: elements.Text, ontology.Page: elements.Text, @@ -134,9 +136,7 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]: for tag in element_type().allowed_tags } CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = { - element_type().css_class_name: element_type - for element_type in ALL_ONTOLOGY_ELEMENT_TYPES - for tag in element_type().allowed_tags + element_type().css_class_name: element_type for element_type in ALL_ONTOLOGY_ELEMENT_TYPES } HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = { diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py index 1eaf3b771e..fd69ead621 100644 --- a/unstructured/documents/ontology.py +++ b/unstructured/documents/ontology.py @@ -18,9 +18,9 @@ import uuid from copy import copy from enum import Enum -from typing import List, Optional +from typing import Any, List, Optional -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup from pydantic import BaseModel, Field @@ -49,19 +49,19 @@ class OntologyElement(BaseModel): html_tag_name: Optional[str] = Field( default_factory=lambda: "", description="HTML Tag name associated with the element" ) - elementType: ElementTypeEnum = Field(..., description="Type of the element") - children: List["OntologyElement"] = Field( + elementType: ElementTypeEnum = Field(description="Type of the element") + children: list["OntologyElement"] = Field( default_factory=list, description="List of child elements" ) - description: str = Field(..., description="Description of the element") - allowed_tags: List[str] = Field(..., description="HTML tags associated with the element") + description: str = Field(description="Description of the element") + allowed_tags: list[str] = Field(description="HTML tags associated with the element") - additional_attributes: Optional[dict] = Field( - {}, description="Optional HTML attributes or CSS properties" + additional_attributes: dict[str, Any] = Field( + default_factory=dict, description="Optional HTML attributes or CSS properties" ) - def __init__(self, **kwargs): + def __init__(self, **kwargs: dict[str, Any]): super().__init__(**kwargs) if self.css_class_name == "": # if None, then do not set self.css_class_name = self.__class__.__name__ @@ -74,9 +74,10 @@ def __init__(self, **kwargs): def generate_unique_id() -> str: return str(uuid.uuid4()).replace("-", "") - def to_html(self, add_children=True) -> str: + def to_html(self, add_children: bool = True) -> str: additional_attrs = copy(self.additional_attributes) additional_attrs.pop("class", None) + additional_attrs.pop("id", None) attr_str = self._construct_attribute_string(additional_attrs) class_attr = f'class="{self.css_class_name}"' if self.css_class_name else "" @@ -89,7 +90,7 @@ def to_html(self, add_children=True) -> str: return result_html - def to_text(self, add_children=True, add_img_alt_text=True) -> str: + def to_text(self, add_children: bool = True, add_img_alt_text: bool = True) -> str: """ Returns the text representation of the element. @@ -111,7 +112,7 @@ def to_text(self, add_children=True, add_img_alt_text=True) -> str: return text.strip() - def _construct_attribute_string(self, attributes: dict) -> str: + def _construct_attribute_string(self, attributes: dict[str, str]) -> str: return " ".join( f'{key}="{value}"' if value else f"{key}" for key, value in attributes.items() ) @@ -138,18 +139,20 @@ def id(self) -> str | None: def page_number(self) -> int | None: if "data-page-number" in self.additional_attributes: try: - return int(self.additional_attributes.get("data-page-number")) + page_attr = self.additional_attributes.get("data-page-number") + if page_attr is not None: + return int(page_attr) except ValueError: return None return None -def remove_ids_and_class_from_table(soup: Tag): +def remove_ids_and_class_from_table(soup: BeautifulSoup): for tag in soup.find_all(True): - if tag.name == "table": + if tag.name == "table": # type: ignore continue # We keep table tag - tag.attrs.pop("class", None) - tag.attrs.pop("id", None) + tag.attrs.pop("class", None) # type: ignore + tag.attrs.pop("id", None) # type: ignore return soup @@ -291,7 +294,7 @@ class Table(OntologyElement): elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) allowed_tags: List[str] = Field(["table"], frozen=True) - def to_html(self, add_children=True) -> str: + def to_html(self, add_children: bool = True) -> str: soup = BeautifulSoup(super().to_html(add_children), "html.parser") soup = remove_ids_and_class_from_table(soup) return str(soup) @@ -460,7 +463,7 @@ class TableOfContents(OntologyElement): elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) allowed_tags: List[str] = Field(["table"], frozen=True) - def to_html(self, add_children=True) -> str: + def to_html(self, add_children: bool = True) -> str: soup = BeautifulSoup(super().to_html(add_children), "html.parser") soup = remove_ids_and_class_from_table(soup) return str(soup) @@ -489,7 +492,7 @@ class FormFieldValue(OntologyElement): elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) allowed_tags: List[str] = Field(["input"], frozen=True) - def to_text(self, add_children=True, add_img_alt_text=True) -> str: + def to_text(self, add_children: bool = True, add_img_alt_text: bool = True) -> str: text = super().to_text(add_children, add_img_alt_text) value = self.additional_attributes.get("value", "") if not value: diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index 71a4982d52..1de4a27aac 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -20,8 +20,8 @@ def ontology_to_unstructured_elements( ontology_element: ontology.OntologyElement, - parent_id: str = None, - page_number: int = None, + parent_id: str | None = None, + page_number: int | None = None, depth: int = 0, filename: str | None = None, add_img_alt_text: bool = True, @@ -51,7 +51,7 @@ def ontology_to_unstructured_elements( Returns: list[Element]: A list of unstructured Element objects. """ - elements_to_return = [] + elements_to_return: list[elements.Element] = [] if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT: if page_number is None and isinstance(ontology_element, ontology.Page): page_number = ontology_element.page_number @@ -71,7 +71,7 @@ def ontology_to_unstructured_elements( ), ) ] - children = [] + children: list[elements.Element] = [] for child in ontology_element.children: child = ontology_to_unstructured_elements( child, @@ -86,12 +86,14 @@ def ontology_to_unstructured_elements( combined_children = combine_inline_elements(children) elements_to_return += combined_children else: - element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__] + element_class: type[elements.Element] = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ + ontology_element.__class__ + ] html_code_of_ontology_element = ontology_element.to_html() element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text) unstructured_element = element_class( - text=element_text, + text=element_text, # type: ignore element_id=ontology_element.id, detection_origin="vlm_partitioner", metadata=elements.ElementMetadata( @@ -126,9 +128,9 @@ def combine_inline_elements(elements: list[elements.Element]) -> list[elements.E Returns: list[Element]: A list of combined elements. """ - result_elements = [] + result_elements: list[elements.Element] = [] - current_element = None + current_element: elements.Element | None = None for next_element in elements: if current_element is None: current_element = next_element @@ -445,7 +447,7 @@ def extract_tag_and_ontology_class_from_tag( return html_tag, element_class -def get_escaped_attributes(soup: Tag): +def get_escaped_attributes(soup: Tag) -> dict[str, str | list[str]]: """ Escapes the attributes of a BeautifulSoup Tag object. @@ -455,7 +457,7 @@ def get_escaped_attributes(soup: Tag): Returns: dict: A dictionary with escaped attribute names and values. """ - escaped_attrs = {} + escaped_attrs: dict[str, str | list[str]] = {} for key, value in soup.attrs.items(): escaped_key = html.escape(key) escaped_value = None From 5f8211e2a5f7fb4dcaef80acb0fc250210ad1e60 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Fri, 6 Jun 2025 13:12:31 +0200 Subject: [PATCH 2/9] Fix tests --- .../documents/html_files/example.html | 46 +- .../html_files/example_full_doc.html | 1336 ++++++++--------- .../example_with_alternative_text.html | 10 +- .../example_with_inline_fields.html | 14 +- .../documents/html_files/three_tables.html | 8 +- .../test_ontology_to_unstructured_parsing.py | 5 + .../unstructured_json_output/example.json | 20 +- .../example_full_doc.json | 186 +-- .../example_with_alternative_text.json | 8 +- .../example_with_inline_fields.json | 8 +- .../three_tables.json | 6 +- ...t_html_to_unstructured_and_back_parsing.py | 216 +-- ...structured_elements_to_ontology_parsing.py | 45 +- .../partition/html/transformations.py | 22 +- 14 files changed, 959 insertions(+), 971 deletions(-) diff --git a/test_unstructured/documents/html_files/example.html b/test_unstructured/documents/html_files/example.html index 14be089463..3abd541255 100644 --- a/test_unstructured/documents/html_files/example.html +++ b/test_unstructured/documents/html_files/example.html @@ -1,41 +1,41 @@ - -
-
-

+ +
+
+

Header

-
-
-