diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 54a1200f08..3d42505174 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,22 +13,9 @@ repos: - id: trailing-whitespace - id: mixed-line-ending - - repo: https://github.com/psf/black - rev: 24.2.0 - hooks: - - id: black - args: ["--line-length=100"] - language_version: python3 - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.1 + rev: v0.15.2 hooks: - id: ruff - args: - ["--fix"] - - - repo: https://github.com/pycqa/flake8 - rev: 7.0.0 - hooks: - - id: flake8 - language_version: python3 + args: ["--fix-only", "--show-fixes"] + - id: ruff-format diff --git a/CHANGELOG.md b/CHANGELOG.md index bbf8846d79..49bab66762 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.22.18 + +### Enhancements +- **Add page number support to v1 HTML parser**: The v1 HTML parser now reads `data-page-number` attributes from ancestor elements and includes the page number in element metadata, consistent with the v2 parser behavior. + ## 0.22.17 ### Fixes diff --git a/test_unstructured/partition/html/test_parser.py b/test_unstructured/partition/html/test_parser.py index c3873db8cc..7eae36354b 100644 --- a/test_unstructured/partition/html/test_parser.py +++ b/test_unstructured/partition/html/test_parser.py @@ -323,6 +323,27 @@ def it_computes_the_normalized_text_of_its_text_segments_to_help( assert accum._normalized_text == "Ford... you're turning into a penguin." + # -- page_number -------------------------------------------------------- + + def it_includes_page_number_in_metadata_when_ancestor_has_data_page_number(self): + html = '

text

' + p = etree.fromstring(html, html_parser).xpath(".//p")[0] + accum = _ElementAccumulator(p) + accum.add(TextSegment("Ford... you're turning into a penguin.", {})) + + (element,) = accum.flush(None) + + assert element.metadata.page_number == 2 + + def it_leaves_page_number_None_when_no_data_page_number_in_tree(self): + p = etree.fromstring("

", html_parser).xpath(".//p")[0] + accum = _ElementAccumulator(p) + accum.add(TextSegment("Ford... you're turning into a penguin.", {})) + + (element,) = accum.flush(None) + + assert element.metadata.page_number is None + # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture() @@ -418,6 +439,37 @@ def it_generates_the_document_elements_from_the_Flow_element(self): with pytest.raises(StopIteration): e = next(elements) + # -- ._page_number ---------------------------------------------------- + + def it_returns_None_when_no_data_page_number_in_tree(self): + p = etree.fromstring("

text

", html_parser).xpath(".//p")[0] + assert p._page_number is None + + def it_finds_page_number_from_ancestor(self): + html = '

text

' + p = etree.fromstring(html, html_parser).xpath(".//p")[0] + assert p._page_number == 1 + + def it_finds_page_number_on_self(self): + html = '
text
' + div = etree.fromstring(html, html_parser).xpath(".//div")[0] + assert div._page_number == 3 + + def it_returns_nearest_ancestors_page_number(self): + html = '

text

' + p = etree.fromstring(html, html_parser).xpath(".//p")[0] + assert p._page_number == 2 + + def it_returns_None_for_non_numeric_data_page_number(self): + html = '

text

' + p = etree.fromstring(html, html_parser).xpath(".//p")[0] + assert p._page_number is None + + def it_falls_back_to_outer_page_number_when_inner_is_non_numeric(self): + html = '

text

' + p = etree.fromstring(html, html_parser).xpath(".//p")[0] + assert p._page_number == 1 + # -- ._element_from_text_or_tail() ------------------------------------ def it_assembles_text_and_tail_document_elements_to_help(self): diff --git a/test_unstructured/partition/html/test_partition.py b/test_unstructured/partition/html/test_partition.py index ca0e14b01b..7018b4b4ef 100644 --- a/test_unstructured/partition/html/test_partition.py +++ b/test_unstructured/partition/html/test_partition.py @@ -1517,3 +1517,37 @@ def test_partition_html_with_empty_content_raises_error(test_case, content): assert len(elements) == 0 finally: os.unlink(temp_filename) + + +# ================================================================================================ +# PAGE NUMBER FROM data-page-number ATTRIBUTE +# ================================================================================================ + + +def test_partition_html_assigns_page_number_from_data_page_number_attribute(): + html_text = ( + "" + '
' + "

First page content.

" + "
Table on page 1
" + "
" + '
' + "

Second page content.

" + "
" + "" + ) + elements = partition_html(text=html_text) + + page_1_elements = [e for e in elements if e.metadata.page_number == 1] + page_2_elements = [e for e in elements if e.metadata.page_number == 2] + + assert len(page_1_elements) == 2 + assert any(isinstance(e, Table) for e in page_1_elements) + assert len(page_2_elements) == 1 + assert page_2_elements[0].text == "Second page content." + + +def test_partition_html_leaves_page_number_None_when_not_present(): + html_text = "

No page markup.

" + elements = partition_html(text=html_text) + assert all(e.metadata.page_number is None for e in elements) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d817f93f7e..7faa8e2d5d 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.22.17" # pragma: no cover +__version__ = "0.22.18" # pragma: no cover diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py index 5f4d09d401..b0ffa54973 100644 --- a/unstructured/partition/html/parser.py +++ b/unstructured/partition/html/parser.py @@ -230,7 +230,7 @@ class _ElementAccumulator: - `flush()` resets the accumulator to its initial empty state. """ - def __init__(self, element: etree.ElementBase): + def __init__(self, element: Flow): self._element = element self._text_segments: list[TextSegment] = [] @@ -270,6 +270,7 @@ def flush(self, ElementCls: type[Element] | None) -> Iterator[Element]: metadata=ElementMetadata( **_consolidate_annotations(ts.annotation for ts in text_segments), category_depth=category_depth, + page_number=self._element._page_number, ), ) @@ -349,6 +350,20 @@ class Flow(etree.ElementBase): def is_phrasing(self) -> bool: return False + @cached_property + def _page_number(self) -> int | None: + """Page number from nearest ancestor (or self) with a valid `data-page-number` attribute.""" + page_attr = self.get("data-page-number") + if page_attr is not None: + try: + return int(page_attr) + except (ValueError, TypeError): + pass + parent = self.getparent() + if parent is not None and isinstance(parent, Flow): + return parent._page_number + return None + def iter_elements(self) -> Iterator[Element]: """Generate paragraph string for each block item within.""" # -- place child elements in a queue -- @@ -507,6 +522,7 @@ def iter_elements(self) -> Iterator[Element]: image_mime_type=img_mime_type, image_base64=img_base64, image_url=img_url, + page_number=self._page_number, ), ) @@ -544,7 +560,10 @@ def iter_cell_texts(tr: etree._Element) -> Iterator[str]: if table_text == "": return - yield Table(table_text, metadata=ElementMetadata(text_as_html=html_table)) + yield Table( + table_text, + metadata=ElementMetadata(text_as_html=html_table, page_number=self._page_number), + ) class RemovedBlock(Flow): diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 9414fa8a8d..21a5d6ebab 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -181,6 +181,7 @@ def process_file_with_ocr( password=password, ) image_paths = cast(List[str], _image_paths) + for i, image_path in enumerate(image_paths): extracted_regions = extracted_layout[i] if i < len(extracted_layout) else None with PILImage.open(image_path) as image: @@ -196,6 +197,7 @@ def process_file_with_ocr( table_ocr_agent=table_ocr_agent, ) merged_page_layouts.append(merged_page_layout) + return DocumentLayout.from_pages(merged_page_layouts) except Exception as e: if os.path.isdir(filename) or os.path.isfile(filename):