Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit d843e51

Browse files
fix: Prevent sorting entities labeled in Document AI Workbench (#200)
* Refactored Entity loading and changed sorting to occur only if all ids are digits.

---------

Co-authored-by: Holt Skinner <holtskinner@google.com>
1 parent 378ebd6 commit d843e51

3 files changed

Lines changed: 34 additions & 17 deletions

File tree

google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -50,23 +50,22 @@ def _entities_from_shards(
5050
List[Entity]:
5151
a list of Entities.
5252
"""
53-
result = []
54-
# Needed to load the correct page index for sharded documents.
55-
page_offset = 0
56-
for shard in shards:
57-
entities = [
58-
Entity(documentai_object=entity, page_offset=page_offset)
59-
for entity in shard.entities
60-
]
61-
properties = [
62-
Entity(documentai_object=prop, page_offset=page_offset)
63-
for entity in shard.entities
64-
for prop in entity.properties
65-
]
66-
result.extend(entities + properties)
67-
page_offset += len(shard.pages)
53+
result = [
54+
Entity(
55+
documentai_object=item,
56+
# Needed to load the correct page index for sharded documents.
57+
page_offset=sum(len(shard.pages) for shard in shards[:i]),
58+
)
59+
for i, shard in enumerate(shards)
60+
for entity in shard.entities
61+
for item in (entity, *entity.properties)
62+
]
6863

69-
if len(result) > 1 and result[0].documentai_object.id:
64+
# https://github.com/googleapis/python-documentai-toolbox/issues/199
65+
# Only sort entities if the ids are all numeric.
66+
# Document AI Workbench labeling outputs hexadecimal ids which should not be sorted.
67+
# Sorting numeric ids is needed for backwards-compatible behavior.
68+
if len(result) > 1 and all(item.documentai_object.id.isdigit() for item in result):
7069
result.sort(key=lambda x: int(x.documentai_object.id))
7170
return result
7271

tests/unit/resources/hex_ids/patent.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

tests/unit/test_document.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def create_document_with_images_without_bbox(get_bytes_images_mock):
109109
doc = document.Document.from_gcs(
110110
gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
111111
)
112+
get_bytes_images_mock.assert_called_once()
112113

113114
del (
114115
doc.entities[0]
@@ -169,7 +170,7 @@ def test_pages_from_shards():
169170
assert page.page_number == page_index + 1
170171

171172

172-
def test_entities_from_shard():
173+
def test_entities_from_shards():
173174
shards = []
174175
for byte in get_bytes("tests/unit/resources/0"):
175176
shards.append(documentai.Document.from_json(byte))
@@ -183,6 +184,22 @@ def test_entities_from_shard():
183184
assert actual[1].normalized_text == "140 USD"
184185

185186

187+
# For documents labeled in Document AI Workbench
188+
def test_entities_from_shards_with_hex_ids():
189+
shards = []
190+
for byte in get_bytes("tests/unit/resources/hex_ids"):
191+
shards.append(documentai.Document.from_json(byte))
192+
193+
actual = document._entities_from_shards(shards=shards)
194+
195+
assert actual[0].documentai_object.id == "ef4fd8a921c0ea81"
196+
assert actual[0].mention_text == "453,945"
197+
assert actual[0].type_ == "application_number"
198+
assert actual[1].documentai_object.id == "ef4fd8a921c0e000"
199+
assert actual[1].mention_text == "G06F 1/26"
200+
assert actual[1].type_ == "class_international"
201+
202+
186203
@mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai")
187204
def test_get_batch_process_metadata_with_valid_operation(
188205
mock_docai,

0 commit comments

Comments
 (0)