Skip to content

Commit e2d8b7a

Browse files
authored
fix: remap parent id after hashing (#4245)
This PR fixes an issue where hashing element IDs breaks `parent_id` references. It occurs when calling `partition_html`, because the partitioning process assigns parent IDs to elements based on the HTML structure before `apply_metadata` is called, i.e., before element ID hashing happens; after hashing, each `parent_id` still holds the parent's old UUID. This fix remaps each `parent_id` to its parent's new hash ID so that parent references remain valid after hashing.
1 parent c1f819c commit e2d8b7a

4 files changed

Lines changed: 74 additions & 2 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.20.6
2+
3+
- fix: remap `parent_id` to the new hash ID after element ID hashing so it still references the correct parent
4+
15
## 0.20.5
26

37
### Fixes

test_unstructured/partition/common/test_metadata.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -482,3 +482,59 @@ def test_assign_hash_ids_produces_unique_and_deterministic_SHA1_ids_even_for_dup
482482
assert len(ids) == len(set(ids))
483483
# -- ids are deterministic, same value is computed each time --
484484
assert all(e.id == e2.id for e, e2 in zip(elements, elements_2))
485+
486+
487+
def test_assign_hash_ids_remaps_parent_id_to_new_hash_id():
488+
"""parent_id values (originally UUIDs) are updated to the corresponding hash IDs."""
489+
title = Title(text="Title", metadata=ElementMetadata(filename="foo.bar", page_number=1))
490+
child = Text(
491+
text="Child",
492+
metadata=ElementMetadata(filename="foo.bar", page_number=1, parent_id=title.id),
493+
)
494+
# -- sanity-check: ids are UUIDs before hashing --
495+
assert len(title.id) == 36
496+
assert child.metadata.parent_id == title.id
497+
498+
_assign_hash_ids([title, child])
499+
500+
# -- ids are now SHA1 hashes --
501+
assert len(title.id) == 32
502+
# -- parent_id has been updated to the new hash id, not the old UUID --
503+
assert child.metadata.parent_id == title.id
504+
505+
506+
def test_assign_hash_ids_leaves_unknown_parent_id_unchanged():
507+
"""A parent_id that has no matching element (e.g. filtered out) is left as-is, not KeyError."""
508+
external_parent_id = "some-external-or-filtered-id"
509+
orphan = Text(
510+
text="Orphan",
511+
metadata=ElementMetadata(filename="foo.bar", page_number=1, parent_id=external_parent_id),
512+
)
513+
514+
# -- should not raise KeyError even though external_parent_id is not in id_mapping --
515+
_assign_hash_ids([orphan])
516+
517+
# -- parent_id is left unchanged because it wasn't in the mapping --
518+
assert orphan.metadata.parent_id == external_parent_id
519+
520+
521+
def test_partition_html_parent_child_relationships_preserved_with_hash_ids():
522+
"""Integration: partition_html with unique_element_ids=False preserves parent-child links."""
523+
from unstructured.partition.html import partition_html
524+
525+
html = "<html><body><h1>My Title</h1><p>My paragraph</p></body></html>"
526+
elements = partition_html(text=html, unique_element_ids=False)
527+
528+
# -- all element ids should be SHA1 hashes (32 hex chars), not UUIDs (36 chars) --
529+
assert all(len(e.id) == 32 for e in elements), "Expected SHA1 hash IDs"
530+
531+
# -- find the title and its child paragraph --
532+
title = next((e for e in elements if isinstance(e, Title)), None)
533+
child = next(
534+
(e for e in elements if e.metadata.parent_id is not None),
535+
None,
536+
)
537+
assert title is not None, "Expected a Title element"
538+
assert child is not None, "Expected at least one element with a parent_id"
539+
# -- parent_id must point to the hashed title id, not an old UUID --
540+
assert child.metadata.parent_id == title.id

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.20.5" # pragma: no cover
1+
__version__ = "0.20.6" # pragma: no cover

unstructured/partition/common/metadata.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,20 +244,32 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
244244

245245

246246
def _assign_hash_ids(elements: list[Element]) -> list[Element]:
247-
"""Converts `.id` of each element from UUID to hash.
247+
"""Converts `.id` of each element from UUID to hash and remaps `parent_id` accordingly.
248248
249249
The hash is based on the `.text` of the element, but also on its page-number and sequence number
250250
on that page. This provides for deterministic results even when the document is split into one
251251
or more fragments for parallel processing.
252+
253+
After hashing, any `element.metadata.parent_id` that references a known original UUID is
254+
updated to the corresponding new hash ID. Parent IDs that do not appear in the mapping (e.g.
255+
because the parent element was filtered out before hashing, or the ID was set manually to an
256+
external value) are left unchanged.
252257
"""
253258
# -- generate sequence number for each element on a page --
254259
page_seq_counts = {}
260+
id_mapping = {}
255261
for element in elements:
256262
page_number = element.metadata.page_number
257263
seq_on_page_counter = page_seq_counts.get(page_number, 0)
264+
original_id = element.id
258265
element.id_to_hash(seq_on_page_counter)
266+
id_mapping[original_id] = element.id
259267
page_seq_counts[page_number] = seq_on_page_counter + 1
260268

269+
for element in elements:
270+
if element.metadata.parent_id is not None and element.metadata.parent_id in id_mapping:
271+
element.metadata.parent_id = id_mapping[element.metadata.parent_id]
272+
261273
return elements
262274

263275

0 commit comments

Comments
 (0)