Skip to content

Commit e131439

Browse files
authored
feat: add group_elements_by_parent_id utility function (#4207)
Add a utility function to group elements by their parent_id metadata field. This allows users to easily traverse document hierarchy by grouping elements that share the same parent. Includes an optional 'assign_orphans' parameter that, when True, assigns elements with no parent_id to the same group as the previous element. Fixes #1489
1 parent 4bbb1ff commit e131439

4 files changed

Lines changed: 102 additions & 5 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.18.33-dev0
2+
3+
### Enhancements
4+
- **Add `group_elements_by_parent_id` utility function**: Groups elements by their `parent_id` metadata field for easier document hierarchy traversal (fixes #1489)
5+
16
## 0.18.32
27

38
### Enhancements
@@ -33,10 +38,6 @@
3338
- **Fix EN DASH not cleaned by `clean_bullets`**: Added EN DASH (`\u2013`) to `UNICODE_BULLETS` pattern so `clean_bullets` properly removes EN DASH bullet points without requiring `clean_dashes` (fixes #4105)
3439
- **Change `languages` parameter default from `["auto"]` to `None`**: Updated default value in `detect_languages()` and `partition_epub()` functions. Behavior unchanged as `None` is converted to `["auto"]` internally. (fixes #2471)
3540
- Resolve GHSA-58pv-8j8x-9vj2
36-
37-
## 0.18.29
38-
39-
### Enhancement
4041
- use render mode data to determine if a character extracted by pdfminer is invisible or not
4142

4243
## 0.18.28

test_unstructured/test_utils.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,3 +331,55 @@ def test_calculate_shared_ngram_percentage_returns_null_vals_for_empty_str():
331331
percent, common_ngrams = utils.calculate_shared_ngram_percentage(str1, str2, n)
332332
assert percent == 0
333333
assert not bool(common_ngrams)
334+
335+
336+
class DescribeGroupElementsByParentId:
    """Unit tests for group_elements_by_parent_id function."""

    @staticmethod
    def _build(*specs):
        """Create elements from `(element_cls, text, parent_id)` triples.

        A `parent_id` of None leaves the element's metadata untouched, making it an orphan.
        """
        built = []
        for element_cls, text, parent_id in specs:
            element = element_cls(text)
            if parent_id is not None:
                element.metadata.parent_id = parent_id
            built.append(element)
        return built

    @staticmethod
    def _texts_by_key(grouped):
        """Reduce each group to the texts of its elements, preserving element order."""
        return {key: [e.text for e in members] for key, members in grouped.items()}

    def it_groups_elements_by_parent_id_with_orphans_in_none_group(self):
        elements = self._build(
            (Title, "Title 1", "parent_A"),
            (NarrativeText, "Child of A", "parent_A"),
            (NarrativeText, "Orphan 1", None),
            (Title, "Title 2", "parent_B"),
            (NarrativeText, "Orphan 2", None),
        )

        grouped = utils.group_elements_by_parent_id(elements)

        # -- keys appear in first-seen order; both orphans share the None group --
        assert list(grouped.keys()) == ["parent_A", None, "parent_B"]
        assert self._texts_by_key(grouped) == {
            "parent_A": ["Title 1", "Child of A"],
            None: ["Orphan 1", "Orphan 2"],
            "parent_B": ["Title 2"],
        }

    def it_assigns_orphans_to_previous_element_group_when_assign_orphans_is_true(self):
        elements = self._build(
            (Title, "Title 1", "parent_A"),
            (NarrativeText, "Child of A", "parent_A"),
            (NarrativeText, "Orphan 1", None),
            (Title, "Title 2", "parent_B"),
            (NarrativeText, "Orphan 2", None),
        )

        grouped = utils.group_elements_by_parent_id(elements, assign_orphans=True)

        # -- each orphan joins the group of the element that precedes it --
        assert list(grouped.keys()) == ["parent_A", "parent_B"]
        assert self._texts_by_key(grouped) == {
            "parent_A": ["Title 1", "Child of A", "Orphan 1"],
            "parent_B": ["Title 2", "Orphan 2"],
        }

    def it_keeps_first_orphan_in_none_group_when_assign_orphans_is_true(self):
        elements = self._build(
            (NarrativeText, "First orphan", None),
            (Title, "Title 1", "parent_A"),
            (NarrativeText, "Orphan 2", None),
        )

        grouped = utils.group_elements_by_parent_id(elements, assign_orphans=True)

        # -- a leading orphan has no earlier group to inherit, so it stays under None --
        assert list(grouped.keys()) == [None, "parent_A"]
        assert self._texts_by_key(grouped) == {
            None: ["First orphan"],
            "parent_A": ["Title 1", "Orphan 2"],
        }

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.32" # pragma: no cover
1+
__version__ = "0.18.33-dev0" # pragma: no cover

unstructured/utils.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,50 @@ def catch_overlapping_and_nested_bboxes(
746746
return document_with_overlapping_flag, overlapping_cases
747747

748748

749+
def group_elements_by_parent_id(
    elements: Iterable["Element"],
    assign_orphans: bool = False,
) -> dict[Optional[str], list["Element"]]:
    """Group elements by their `parent_id` metadata field.

    Elements are visited in order and appended to the list keyed by their
    `metadata.parent_id`. Keys appear in the result in the order they are first encountered
    and elements keep their relative order within each group.

    Args:
        elements: An iterable of Element objects to group.
        assign_orphans: If True, an element with no parent_id (None) is assigned to the same
            group as the previous element, i.e. the group of the most recent element that had
            a parent_id. Orphans that occur before any element with a parent_id have no group
            to inherit and remain under the None key. If False (default), all elements with
            no parent are grouped under the None key.

    Returns:
        A plain dictionary mapping parent_id values to lists of elements sharing that
        parent_id. An empty iterable produces an empty dictionary.

    Example:
        >>> elements = partition("example.pdf")
        >>> grouped = group_elements_by_parent_id(elements)
        >>> for parent_id, children in grouped.items():
        ...     print(f"Parent {parent_id}: {len(children)} children")

        >>> # Assign orphan elements to previous element's group
        >>> grouped = group_elements_by_parent_id(elements, assign_orphans=True)
    """
    groups: dict[Optional[str], list["Element"]] = {}
    # -- most recent non-None parent_id seen so far; None until one has been seen --
    last_parent_id: Optional[str] = None

    for element in elements:
        # -- tolerate metadata objects that do not define `parent_id` at all --
        parent_id = getattr(element.metadata, "parent_id", None)

        if parent_id is not None:
            last_parent_id = parent_id
        elif assign_orphans:
            # -- orphan inherits the preceding group (still None for leading orphans) --
            parent_id = last_parent_id

        groups.setdefault(parent_id, []).append(element)

    return groups
791+
792+
749793
class FileHandler:
750794
def __init__(self, file_path: str):
751795
self.file_path = file_path

0 commit comments

Comments
 (0)