Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 3 additions & 16 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,9 @@ repos:
- id: trailing-whitespace
- id: mixed-line-ending

- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
- id: black
args: ["--line-length=100"]
language_version: python3

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.1
rev: v0.15.2
hooks:
- id: ruff
args:
["--fix"]

- repo: https://github.com/pycqa/flake8
rev: 7.0.0
hooks:
- id: flake8
language_version: python3
args: ["--fix-only", "--show-fixes"]
- id: ruff-format
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.22.18

### Enhancements
- **Add page number support to v1 HTML parser**: The v1 HTML parser now reads the `data-page-number` attribute from an element or, failing that, its nearest ancestor that carries a valid value, and includes that page number in element metadata, consistent with the v2 parser behavior.

## 0.22.17

### Fixes
Expand Down
52 changes: 52 additions & 0 deletions test_unstructured/partition/html/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,27 @@ def it_computes_the_normalized_text_of_its_text_segments_to_help(

assert accum._normalized_text == "Ford... you're turning into a penguin."

# -- page_number --------------------------------------------------------

def it_includes_page_number_in_metadata_when_ancestor_has_data_page_number(self):
    # -- only the enclosing `<div>` carries `data-page-number`; the `<p>` itself does not --
    root = etree.fromstring('<div data-page-number="2"><p>text</p></div>', html_parser)
    paragraph = root.xpath(".//p")[0]
    accumulator = _ElementAccumulator(paragraph)
    accumulator.add(TextSegment("Ford... you're turning into a penguin.", {}))

    # -- single-item unpacking fails loudly unless exactly one element is flushed --
    (flushed,) = accumulator.flush(None)

    assert flushed.metadata.page_number == 2

def it_leaves_page_number_None_when_no_data_page_number_in_tree(self):
    # -- the markup contains no `data-page-number` attribute anywhere --
    root = etree.fromstring("<p/>", html_parser)
    paragraph = root.xpath(".//p")[0]
    accumulator = _ElementAccumulator(paragraph)
    accumulator.add(TextSegment("Ford... you're turning into a penguin.", {}))

    # -- single-item unpacking fails loudly unless exactly one element is flushed --
    (flushed,) = accumulator.flush(None)

    assert flushed.metadata.page_number is None

# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture()
Expand Down Expand Up @@ -418,6 +439,37 @@ def it_generates_the_document_elements_from_the_Flow_element(self):
with pytest.raises(StopIteration):
e = next(elements)

# -- ._page_number ----------------------------------------------------

def it_returns_None_when_no_data_page_number_in_tree(self):
    # -- neither the `<p>` nor its enclosing `<div>` declares `data-page-number` --
    root = etree.fromstring("<div><p>text</p></div>", html_parser)
    paragraph = root.xpath(".//p")[0]
    assert paragraph._page_number is None

def it_finds_page_number_from_ancestor(self):
    # -- the attribute sits on the parent `<div>`, not on the `<p>` under test --
    root = etree.fromstring('<div data-page-number="1"><p>text</p></div>', html_parser)
    paragraph = root.xpath(".//p")[0]
    assert paragraph._page_number == 1

def it_finds_page_number_on_self(self):
    # -- the element under test is the `<div>` that carries the attribute itself --
    root = etree.fromstring('<div data-page-number="3"><span>text</span></div>', html_parser)
    page_div = root.xpath(".//div")[0]
    assert page_div._page_number == 3

def it_returns_nearest_ancestors_page_number(self):
    # -- two nested page containers; the inner one (page 2) is closest to the `<p>` --
    root = etree.fromstring(
        '<div data-page-number="1"><div data-page-number="2"><p>text</p></div></div>',
        html_parser,
    )
    paragraph = root.xpath(".//p")[0]
    assert paragraph._page_number == 2

def it_returns_None_for_non_numeric_data_page_number(self):
    # -- "abc" cannot be parsed as an integer and no other ancestor offers a value --
    root = etree.fromstring('<div data-page-number="abc"><p>text</p></div>', html_parser)
    paragraph = root.xpath(".//p")[0]
    assert paragraph._page_number is None

def it_falls_back_to_outer_page_number_when_inner_is_non_numeric(self):
    # -- the inner container's value is unusable, so the outer one (page 1) should win --
    root = etree.fromstring(
        '<div data-page-number="1"><div data-page-number="abc"><p>text</p></div></div>',
        html_parser,
    )
    paragraph = root.xpath(".//p")[0]
    assert paragraph._page_number == 1

# -- ._element_from_text_or_tail() ------------------------------------

def it_assembles_text_and_tail_document_elements_to_help(self):
Expand Down
34 changes: 34 additions & 0 deletions test_unstructured/partition/html/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -1517,3 +1517,37 @@ def test_partition_html_with_empty_content_raises_error(test_case, content):
assert len(elements) == 0
finally:
os.unlink(temp_filename)


# ================================================================================================
# PAGE NUMBER FROM data-page-number ATTRIBUTE
# ================================================================================================


def test_partition_html_assigns_page_number_from_data_page_number_attribute():
    """Elements take the page number of the `data-page-number` div that encloses them."""
    html_text = (
        "<html><body>"
        ' <div data-page-number="1">'
        " <p>First page content.</p>"
        " <table><tr><td>Table on page 1</td></tr></table>"
        " </div>"
        ' <div data-page-number="2">'
        " <p>Second page content.</p>"
        " </div>"
        "</body></html>"
    )

    elements = partition_html(text=html_text)
    on_page_1 = [el for el in elements if el.metadata.page_number == 1]
    on_page_2 = [el for el in elements if el.metadata.page_number == 2]

    # -- two elements are expected on page 1, at least one of them a `Table` --
    assert len(on_page_1) == 2
    assert any(isinstance(el, Table) for el in on_page_1)
    # -- page 2 is expected to hold just the one paragraph --
    assert len(on_page_2) == 1
    assert on_page_2[0].text == "Second page content."


def test_partition_html_leaves_page_number_None_when_not_present():
    """Without any `data-page-number` markup, no element is given a page number."""
    elements = partition_html(text="<html><body><p>No page markup.</p></body></html>")
    page_numbers = [el.metadata.page_number for el in elements]
    assert all(number is None for number in page_numbers)
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.17" # pragma: no cover
__version__ = "0.22.18" # pragma: no cover
23 changes: 21 additions & 2 deletions unstructured/partition/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ class _ElementAccumulator:
- `flush()` resets the accumulator to its initial empty state.
"""

def __init__(self, element: etree.ElementBase):
def __init__(self, element: Flow):
self._element = element
self._text_segments: list[TextSegment] = []

Expand Down Expand Up @@ -270,6 +270,7 @@ def flush(self, ElementCls: type[Element] | None) -> Iterator[Element]:
metadata=ElementMetadata(
**_consolidate_annotations(ts.annotation for ts in text_segments),
category_depth=category_depth,
page_number=self._element._page_number,
),
)

Expand Down Expand Up @@ -349,6 +350,20 @@ class Flow(etree.ElementBase):
def is_phrasing(self) -> bool:
return False

@cached_property
def _page_number(self) -> int | None:
    """Page number declared by this element or, failing that, by its nearest `Flow` ancestor.

    The element's own `data-page-number` attribute is consulted first. When that attribute is
    absent, or its value cannot be converted by `int()`, the lookup is delegated to the parent
    element's `_page_number`. Returns `None` when the walk reaches an element that has no
    parent, or whose parent is not a `Flow` instance, without finding a usable value.
    """
    raw_value = self.get("data-page-number")
    if raw_value is not None:
        try:
            return int(raw_value)
        except (ValueError, TypeError):
            # -- unusable value on this element; fall through and ask the ancestors --
            pass

    ancestor = self.getparent()
    # -- stop at the top of the tree or at a parent that is not a `Flow` element --
    if ancestor is None or not isinstance(ancestor, Flow):
        return None
    return ancestor._page_number

def iter_elements(self) -> Iterator[Element]:
"""Generate paragraph string for each block item within."""
# -- place child elements in a queue --
Expand Down Expand Up @@ -507,6 +522,7 @@ def iter_elements(self) -> Iterator[Element]:
image_mime_type=img_mime_type,
image_base64=img_base64,
image_url=img_url,
page_number=self._page_number,
),
)

Expand Down Expand Up @@ -544,7 +560,10 @@ def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
if table_text == "":
return

yield Table(table_text, metadata=ElementMetadata(text_as_html=html_table))
yield Table(
table_text,
metadata=ElementMetadata(text_as_html=html_table, page_number=self._page_number),
)


class RemovedBlock(Flow):
Expand Down
2 changes: 2 additions & 0 deletions unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ def process_file_with_ocr(
password=password,
)
image_paths = cast(List[str], _image_paths)

for i, image_path in enumerate(image_paths):
extracted_regions = extracted_layout[i] if i < len(extracted_layout) else None
with PILImage.open(image_path) as image:
Expand All @@ -196,6 +197,7 @@ def process_file_with_ocr(
table_ocr_agent=table_ocr_agent,
)
merged_page_layouts.append(merged_page_layout)

return DocumentLayout.from_pages(merged_page_layouts)
except Exception as e:
if os.path.isdir(filename) or os.path.isfile(filename):
Expand Down
Loading