Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit 028bc37

Browse files
authored
fix: docai_utilities.py to return Optional (#176)
* fix: `docai_utilities.py` to return `Optional`
  - Should resolve customer-reported issue in support case #47169701 relating to duplicate/inaccurate elements in hOCR output
  - Follow-up to:
    - #161
    - #169
* Increase test coverage
* Addressed review comments
1 parent 212639b commit 028bc37

6 files changed

Lines changed: 83 additions & 11 deletions

File tree

google/cloud/documentai_toolbox/utilities/docai_utilities.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,15 @@
1515
#
1616
"""Utilities for Document AI"""
1717

18-
from typing import Tuple
18+
from typing import Optional, Tuple
1919

2020
from google.cloud import documentai
2121

2222

2323
def get_bounding_box(
2424
bounding_poly: documentai.BoundingPoly,
2525
page_dimension: documentai.Document.Page.Dimension,
26-
) -> Tuple[int, int, int, int]:
26+
) -> Optional[Tuple[int, int, int, int]]:
2727
r"""Returns the bounding box of an element from the element bounding_poly and page dimensions.
2828
2929
Args:
@@ -35,10 +35,10 @@ def get_bounding_box(
3535
Returns:
3636
Optional[Tuple[int, int, int, int]]:
3737
Bounding box coordinates in order (min_x, min_y, max_x, max_y), i.e. (left, top, right, bottom).
38-
Returns `0, 0, 0, 0` if `bounding_poly.normalized_vertices` is empty.
38+
Returns `None` if `bounding_poly` or `bounding_poly.normalized_vertices` is empty.
3939
"""
40-
if not bounding_poly.normalized_vertices:
41-
return 0, 0, 0, 0
40+
if not bounding_poly or not bounding_poly.normalized_vertices:
41+
return None
4242

4343
vertices = [
4444
(

google/cloud/documentai_toolbox/wrappers/entity.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,11 @@ def crop_image(
9595
if not documentai_page.image:
9696
raise ValueError("Document does not contain images.")
9797

98-
top, left, bottom, right = docai_utilities.get_bounding_box(
98+
bbox = docai_utilities.get_bounding_box(
9999
bounding_poly=self.documentai_object.page_anchor.page_refs[0].bounding_poly,
100100
page_dimension=documentai_page.dimension,
101101
)
102+
if bbox is None:
103+
return None
102104
doc_image = Image.open(BytesIO(documentai_page.image.content))
103-
return doc_image.crop((top, left, bottom, right))
105+
return doc_image.crop(bbox)

google/cloud/documentai_toolbox/wrappers/page.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"""Wrappers for Document AI Page type."""
1717

1818
import dataclasses
19-
from typing import List, Optional, Union, cast
19+
from typing import cast, List, Optional, Union
2020

2121
import pandas as pd
2222

@@ -301,7 +301,7 @@ def _table_rows_from_documentai_table_rows(
301301
def _get_hocr_bounding_box(
302302
element_with_layout: ElementWithLayout,
303303
page_dimension: documentai.Document.Page.Dimension,
304-
) -> str:
304+
) -> Optional[str]:
305305
r"""Returns a hOCR bounding box string.
306306
307307
Args:
@@ -311,13 +311,21 @@ def _get_hocr_bounding_box(
311311
Required. Page dimension.
312312
313313
Returns:
314-
str:
314+
Optional[str]:
315315
hOCR bounding box string, or `None` if the element has no bounding box.
316316
"""
317-
min_x, min_y, max_x, max_y = docai_utilities.get_bounding_box(
317+
if not element_with_layout.layout.bounding_poly:
318+
return None
319+
320+
bbox = docai_utilities.get_bounding_box(
318321
bounding_poly=element_with_layout.layout.bounding_poly,
319322
page_dimension=page_dimension,
320323
)
324+
325+
if not bbox:
326+
return None
327+
328+
min_x, min_y, max_x, max_y = bbox
321329
return f"bbox {min_x} {min_y} {max_x} {max_y}"
322330

323331

tests/unit/test_document.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,19 @@ def get_bytes_missing_shard_mock():
105105
yield byte_factory
106106

107107

108+
def create_document_with_images_without_bbox(get_bytes_images_mock):
109+
doc = document.Document.from_gcs(
110+
gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
111+
)
112+
113+
del (
114+
doc.entities[0]
115+
.documentai_object.page_anchor.page_refs[0]
116+
.bounding_poly.normalized_vertices
117+
)
118+
return doc
119+
120+
108121
def test_get_shards_with_gcs_uri_contains_file_type():
109122
with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"):
110123
document._get_shards(
@@ -299,6 +312,13 @@ def test_document_from_document_path_with_single_shard():
299312
assert len(actual.pages) == 1
300313

301314

315+
def test_document_from_document_path_with_directory():
316+
actual = document.Document.from_document_path(
317+
document_path="tests/unit/resources/0/"
318+
)
319+
assert len(actual.pages) == 1
320+
321+
302322
def test_document_from_documentai_document_with_single_shard():
303323
with open(
304324
"tests/unit/resources/0/toolbox_invoice_test-0.json", "r", encoding="utf-8"
@@ -626,6 +646,7 @@ def test_export_images(get_bytes_images_mock):
626646
output_path = "resources/output/"
627647
if os.path.exists(output_path):
628648
shutil.rmtree(output_path)
649+
assert not os.path.exists(output_path)
629650

630651
doc = document.Document.from_gcs(
631652
gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
@@ -648,6 +669,20 @@ def test_export_images(get_bytes_images_mock):
648669
shutil.rmtree(output_path)
649670

650671

672+
def test_export_images_empty_bounding_box(get_bytes_images_mock):
673+
output_path = "resources/output/"
674+
675+
doc = create_document_with_images_without_bbox(get_bytes_images_mock)
676+
actual = doc.export_images(
677+
output_path=output_path,
678+
output_file_prefix="exported_photo",
679+
output_file_extension="png",
680+
)
681+
get_bytes_images_mock.assert_called_once()
682+
683+
assert not actual
684+
685+
651686
def test_export_hocr_str():
652687
wrapped_document = document.Document.from_document_path(
653688
document_path="tests/unit/resources/0/toolbox_invoice_test-0.json"

tests/unit/test_entity.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,15 @@ def test_crop_image_without_page_image(docproto):
102102
match="Document does not contain images.",
103103
):
104104
doc.entities[0].crop_image(documentai_page=docproto.pages[0])
105+
106+
107+
def test_crop_image_empty_bounding_box(docproto):
108+
doc = document.Document.from_documentai_document(docproto)
109+
del (
110+
doc.entities[0]
111+
.documentai_object.page_anchor.page_refs[0]
112+
.bounding_poly.normalized_vertices
113+
)
114+
115+
actual = doc.entities[0].crop_image(documentai_page=docproto.pages[0])
116+
assert actual is None

tests/unit/test_page.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ def docproto_form_parser():
4040
return documentai.Document.from_json(f.read())
4141

4242

43+
@pytest.fixture
44+
def docproto_blank_document():
45+
with open("tests/unit/resources/blank_document.json", "r", encoding="utf-8") as f:
46+
return documentai.Document.from_json(f.read())
47+
48+
4349
def test_table_to_csv(docproto):
4450
docproto_page = docproto.pages[0]
4551
table = page.Table(
@@ -160,6 +166,15 @@ def test_get_hocr_bounding_box(docproto):
160166
assert hocr_bounding_box_with_vertices == "bbox 1310 220 1534 282"
161167

162168

169+
def test_get_hocr_bounding_box_with_blank_document(docproto_blank_document):
170+
hocr_bounding_box_normalized = page._get_hocr_bounding_box(
171+
element_with_layout=docproto_blank_document.pages[0],
172+
page_dimension=docproto_blank_document.pages[0].dimension,
173+
)
174+
175+
assert hocr_bounding_box_normalized is None
176+
177+
163178
# Class init Tests
164179

165180

0 commit comments

Comments
 (0)