diff --git a/CHANGELOG.md b/CHANGELOG.md index 2140662067..15d307657c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.11-dev0 + +### Enhancements + +### Features + +### Fixes +- Invalid elements IDs are not visible in VLM output. Parent-child hierarchy is now retrieved based on unstructured element ID, instead of id injected into HTML code of element. + ## 0.17.10 - Drop Python 3.9 support as it reaches EOL in October 2025 - Update pip-compile script to use Python 3.10 and newer diff --git a/scripts/html/rendered_html_from_elements.py b/scripts/html/rendered_html_from_elements.py index 5789a83d14..019810e196 100644 --- a/scripts/html/rendered_html_from_elements.py +++ b/scripts/html/rendered_html_from_elements.py @@ -10,16 +10,12 @@ """ import argparse +import html import logging import os import select import sys -from collections import defaultdict -from typing import List, Sequence -from bs4 import BeautifulSoup - -from unstructured.documents import elements from unstructured.partition.html.transformations import unstructured_elements_to_ontology from unstructured.staging.base import elements_from_json @@ -28,48 +24,6 @@ logger = logging.getLogger(__name__) -def extract_document_div(html_content: str) -> str: - pos = html_content.find(">") - if pos != -1: - return html_content[: pos + 1] - logger.error("No '>' found in the HTML content.") - raise ValueError("No '>' found in the HTML content.") - - -def extract_page_div(html_content: str) -> str: - soup = BeautifulSoup(html_content, "html.parser") - page_divs = soup.find_all("div", class_="Page") - if len(page_divs) != 1: - logger.error( - "Expected exactly one
element with class 'Page'. Found %d.", len(page_divs) - ) - raise ValueError("Expected exactly one
element with class 'Page'.") - return str(page_divs[0]) - - -def fold_document_div( - html_document_start: str, html_document_end: str, html_per_page: List[str] -) -> str: - html_document = html_document_start - for page_html in html_per_page: - html_document += page_html - html_document += html_document_end - return html_document - - -def group_elements_by_page( - unstructured_elements: Sequence[elements.Element], -) -> Sequence[Sequence[elements.Element]]: - pages_dict = defaultdict(list) - - for element in unstructured_elements: - page_number = element.metadata.page_number - pages_dict[page_number].append(element) - - pages_list = list(pages_dict.values()) - return pages_list - - def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str: """Renders HTML from a JSON file with unstructured elements. @@ -91,18 +45,10 @@ def rendered_html(*, filepath: str | None = None, text: str | None = None) -> st logger.info("Rendering HTML from text.") unstructured_elements = elements_from_json(filename=filepath, text=text) - unstructured_elements_per_page = group_elements_by_page(unstructured_elements) - # parsed_ontology = unstructured_elements_to_ontology(unstructured_elements) - parsed_ontology_per_page = [ - unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page - ] - html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page] - - html_document_start = extract_document_div(html_per_page[0]) - html_document_end = "
" - html_per_page = [extract_page_div(page) for page in html_per_page] - - return fold_document_div(html_document_start, html_document_end, html_per_page) + ontology_root = unstructured_elements_to_ontology(unstructured_elements) + html_document = ontology_root.to_html() + unescaped_html = html.unescape(html_document) + return unescaped_html def _main(): diff --git a/test_unstructured/documents/html_files/example.html b/test_unstructured/documents/html_files/example.html index 14be089463..3abd541255 100644 --- a/test_unstructured/documents/html_files/example.html +++ b/test_unstructured/documents/html_files/example.html @@ -1,41 +1,41 @@ - -
-
-

+ +
+
+

Header

-
-
-