Skip to content

Commit d299095

Browse files
authored
feat: add page number support to v1 html partition (#4327)
This PR adds page-number support when partitioning HTML with the v1 parser.
- Add `page_number` support to the v1 HTML parser by reading the `data-page-number` attribute from the element itself or its nearest ancestor, consistent with v2 parser behavior
- Add a `_page_number` cached property on `Flow` using an efficient parent-chain lookup (O(n) total vs. an O(n*depth) ancestor walk)
- Wire the page number into all three element-creation paths: text elements, images, and tables
- Malformed `data-page-number` values are skipped, and the lookup falls back to the nearest ancestor with a valid value
1 parent 615782a commit d299095

7 files changed

Lines changed: 118 additions & 19 deletions

File tree

.pre-commit-config.yaml

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,9 @@ repos:
1313
- id: trailing-whitespace
1414
- id: mixed-line-ending
1515

16-
- repo: https://github.com/psf/black
17-
rev: 24.2.0
18-
hooks:
19-
- id: black
20-
args: ["--line-length=100"]
21-
language_version: python3
22-
2316
- repo: https://github.com/astral-sh/ruff-pre-commit
24-
rev: v0.2.1
17+
rev: v0.15.2
2518
hooks:
2619
- id: ruff
27-
args:
28-
["--fix"]
29-
30-
- repo: https://github.com/pycqa/flake8
31-
rev: 7.0.0
32-
hooks:
33-
- id: flake8
34-
language_version: python3
20+
args: ["--fix-only", "--show-fixes"]
21+
- id: ruff-format

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.22.18
2+
3+
### Enhancements
4+
- **Add page number support to v1 HTML parser**: The v1 HTML parser now reads the `data-page-number` attribute from an element or its nearest ancestor and includes the page number in element metadata, consistent with the v2 parser behavior. Non-integer values are ignored in favor of the nearest ancestor with a valid value.
5+
16
## 0.22.17
27

38
### Fixes

test_unstructured/partition/html/test_parser.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,27 @@ def it_computes_the_normalized_text_of_its_text_segments_to_help(
323323

324324
assert accum._normalized_text == "Ford... you're turning into a penguin."
325325

326+
# -- page_number --------------------------------------------------------
327+
328+
def it_includes_page_number_in_metadata_when_ancestor_has_data_page_number(self):
329+
html = '<div data-page-number="2"><p>text</p></div>'
330+
p = etree.fromstring(html, html_parser).xpath(".//p")[0]
331+
accum = _ElementAccumulator(p)
332+
accum.add(TextSegment("Ford... you're turning into a penguin.", {}))
333+
334+
(element,) = accum.flush(None)
335+
336+
assert element.metadata.page_number == 2
337+
338+
def it_leaves_page_number_None_when_no_data_page_number_in_tree(self):
339+
p = etree.fromstring("<p/>", html_parser).xpath(".//p")[0]
340+
accum = _ElementAccumulator(p)
341+
accum.add(TextSegment("Ford... you're turning into a penguin.", {}))
342+
343+
(element,) = accum.flush(None)
344+
345+
assert element.metadata.page_number is None
346+
326347
# -- fixtures --------------------------------------------------------------------------------
327348

328349
@pytest.fixture()
@@ -418,6 +439,37 @@ def it_generates_the_document_elements_from_the_Flow_element(self):
418439
with pytest.raises(StopIteration):
419440
e = next(elements)
420441

442+
# -- ._page_number ----------------------------------------------------
443+
444+
def it_returns_None_when_no_data_page_number_in_tree(self):
445+
p = etree.fromstring("<div><p>text</p></div>", html_parser).xpath(".//p")[0]
446+
assert p._page_number is None
447+
448+
def it_finds_page_number_from_ancestor(self):
449+
html = '<div data-page-number="1"><p>text</p></div>'
450+
p = etree.fromstring(html, html_parser).xpath(".//p")[0]
451+
assert p._page_number == 1
452+
453+
def it_finds_page_number_on_self(self):
454+
html = '<div data-page-number="3"><span>text</span></div>'
455+
div = etree.fromstring(html, html_parser).xpath(".//div")[0]
456+
assert div._page_number == 3
457+
458+
def it_returns_nearest_ancestors_page_number(self):
459+
html = '<div data-page-number="1"><div data-page-number="2"><p>text</p></div></div>'
460+
p = etree.fromstring(html, html_parser).xpath(".//p")[0]
461+
assert p._page_number == 2
462+
463+
def it_returns_None_for_non_numeric_data_page_number(self):
464+
html = '<div data-page-number="abc"><p>text</p></div>'
465+
p = etree.fromstring(html, html_parser).xpath(".//p")[0]
466+
assert p._page_number is None
467+
468+
def it_falls_back_to_outer_page_number_when_inner_is_non_numeric(self):
469+
html = '<div data-page-number="1"><div data-page-number="abc"><p>text</p></div></div>'
470+
p = etree.fromstring(html, html_parser).xpath(".//p")[0]
471+
assert p._page_number == 1
472+
421473
# -- ._element_from_text_or_tail() ------------------------------------
422474

423475
def it_assembles_text_and_tail_document_elements_to_help(self):

test_unstructured/partition/html/test_partition.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,3 +1517,37 @@ def test_partition_html_with_empty_content_raises_error(test_case, content):
15171517
assert len(elements) == 0
15181518
finally:
15191519
os.unlink(temp_filename)
1520+
1521+
1522+
# ================================================================================================
1523+
# PAGE NUMBER FROM data-page-number ATTRIBUTE
1524+
# ================================================================================================
1525+
1526+
1527+
def test_partition_html_assigns_page_number_from_data_page_number_attribute():
1528+
html_text = (
1529+
"<html><body>"
1530+
' <div data-page-number="1">'
1531+
" <p>First page content.</p>"
1532+
" <table><tr><td>Table on page 1</td></tr></table>"
1533+
" </div>"
1534+
' <div data-page-number="2">'
1535+
" <p>Second page content.</p>"
1536+
" </div>"
1537+
"</body></html>"
1538+
)
1539+
elements = partition_html(text=html_text)
1540+
1541+
page_1_elements = [e for e in elements if e.metadata.page_number == 1]
1542+
page_2_elements = [e for e in elements if e.metadata.page_number == 2]
1543+
1544+
assert len(page_1_elements) == 2
1545+
assert any(isinstance(e, Table) for e in page_1_elements)
1546+
assert len(page_2_elements) == 1
1547+
assert page_2_elements[0].text == "Second page content."
1548+
1549+
1550+
def test_partition_html_leaves_page_number_None_when_not_present():
1551+
html_text = "<html><body><p>No page markup.</p></body></html>"
1552+
elements = partition_html(text=html_text)
1553+
assert all(e.metadata.page_number is None for e in elements)

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.22.17" # pragma: no cover
1+
__version__ = "0.22.18" # pragma: no cover

unstructured/partition/html/parser.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ class _ElementAccumulator:
230230
- `flush()` resets the accumulator to its initial empty state.
231231
"""
232232

233-
def __init__(self, element: etree.ElementBase):
233+
def __init__(self, element: Flow):
234234
self._element = element
235235
self._text_segments: list[TextSegment] = []
236236

@@ -270,6 +270,7 @@ def flush(self, ElementCls: type[Element] | None) -> Iterator[Element]:
270270
metadata=ElementMetadata(
271271
**_consolidate_annotations(ts.annotation for ts in text_segments),
272272
category_depth=category_depth,
273+
page_number=self._element._page_number,
273274
),
274275
)
275276

@@ -349,6 +350,20 @@ class Flow(etree.ElementBase):
349350
def is_phrasing(self) -> bool:
350351
return False
351352

353+
@cached_property
354+
def _page_number(self) -> int | None:
355+
"""Page number from nearest ancestor (or self) with a valid `data-page-number` attribute."""
356+
page_attr = self.get("data-page-number")
357+
if page_attr is not None:
358+
try:
359+
return int(page_attr)
360+
except (ValueError, TypeError):
361+
pass
362+
parent = self.getparent()
363+
if parent is not None and isinstance(parent, Flow):
364+
return parent._page_number
365+
return None
366+
352367
def iter_elements(self) -> Iterator[Element]:
353368
"""Generate paragraph string for each block item within."""
354369
# -- place child elements in a queue --
@@ -507,6 +522,7 @@ def iter_elements(self) -> Iterator[Element]:
507522
image_mime_type=img_mime_type,
508523
image_base64=img_base64,
509524
image_url=img_url,
525+
page_number=self._page_number,
510526
),
511527
)
512528

@@ -544,7 +560,10 @@ def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
544560
if table_text == "":
545561
return
546562

547-
yield Table(table_text, metadata=ElementMetadata(text_as_html=html_table))
563+
yield Table(
564+
table_text,
565+
metadata=ElementMetadata(text_as_html=html_table, page_number=self._page_number),
566+
)
548567

549568

550569
class RemovedBlock(Flow):

unstructured/partition/pdf_image/ocr.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ def process_file_with_ocr(
181181
password=password,
182182
)
183183
image_paths = cast(List[str], _image_paths)
184+
184185
for i, image_path in enumerate(image_paths):
185186
extracted_regions = extracted_layout[i] if i < len(extracted_layout) else None
186187
with PILImage.open(image_path) as image:
@@ -196,6 +197,7 @@ def process_file_with_ocr(
196197
table_ocr_agent=table_ocr_agent,
197198
)
198199
merged_page_layouts.append(merged_page_layout)
200+
199201
return DocumentLayout.from_pages(merged_page_layouts)
200202
except Exception as e:
201203
if os.path.isdir(filename) or os.path.isfile(filename):

0 commit comments

Comments
 (0)