Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit 735514e

Browse files
galz10 and holtskinner authored
feat: add config based annotation converter (#72)
* feat: converters for external document annotations * added tests for converter * added converter tests * added more tests to increase coverage * combined converter files * lint and test fixes * lint and docs fixes * addressed comments * lint fix * Update google/cloud/documentai_toolbox/converters/config/converter_helpers.py Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> * updated comments and function naming * fixed failing tests * added samples and lint fixes * fixed region tags * fix region tag * Update samples/snippets/convert_document_from_azure_sample.py Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> * Update samples/snippets/convert_document_from_azure_sample.py Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> * lint fixes * added more comments * Update samples/snippets/convert_document_from_azure_sample.py Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> * Update samples/snippets/convert_document_from_azure_sample.py Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> * Update samples/snippets/convert_document_from_azure_sample.py Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> * changed name of converter sample per Holts comment --------- Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
1 parent 3b7d604 commit 735514e

28 files changed

Lines changed: 2537 additions & 101 deletions

google/cloud/documentai_toolbox/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,11 @@
2525
)
2626

2727
from .converters import (
28-
converters,
28+
converter,
2929
)
3030

3131
from .utilities import (
3232
utilities,
3333
)
3434

35-
__all__ = (document, page, entity, converters, utilities)
35+
# __all__ must list the exported names as strings. Putting the module objects
# themselves in the tuple makes `from google.cloud.documentai_toolbox import *`
# fail with a TypeError.
__all__ = ("document", "page", "entity", "converter", "utilities")
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright 2023 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright 2023 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
from typing import Callable
18+
from intervaltree import intervaltree
19+
20+
from google.cloud import documentai
21+
from google.cloud.documentai_v1.types import geometry
22+
23+
24+
def _midpoint_in_bpoly(
    box_a: geometry.BoundingPoly, box_b: geometry.BoundingPoly
) -> bool:
    """Checks whether the center of box_a lies strictly inside box_b.

    Both polygons are reduced to the axis-aligned extent of their normalized
    vertices. A midpoint that sits exactly on an edge of box_b is not
    considered to be inside.
    """
    center_x = (_get_norm_x_max(box_a) + _get_norm_x_min(box_a)) / 2.0
    center_y = (_get_norm_y_max(box_a) + _get_norm_y_min(box_a)) / 2.0

    x_low, x_high = _get_norm_x_min(box_b), _get_norm_x_max(box_b)
    y_low, y_high = _get_norm_y_min(box_b), _get_norm_y_max(box_b)

    if not x_low < center_x < x_high:
        return False
    return y_low < center_y < y_high
39+
40+
41+
def _merge_text_anchors(
    text_anchor_1: documentai.Document.TextAnchor,
    text_anchor_2: documentai.Document.TextAnchor,
) -> documentai.Document.TextAnchor:
    """Combines the segments of two TextAnchors into one sorted TextAnchor.

    Every text segment of both anchors is loaded into an interval tree, the
    ranges are merged, and the remaining ranges are emitted in ascending
    order as the segments of a new TextAnchor.
    """
    spans = [
        intervaltree.Interval(segment.start_index, segment.end_index)
        for anchor in (text_anchor_1, text_anchor_2)
        for segment in anchor.text_segments
    ]

    tree = intervaltree.IntervalTree(spans)
    # strict=False asks intervaltree to fuse touching ranges as well as
    # overlapping ones.
    tree.merge_overlaps(strict=False)

    result = documentai.Document.TextAnchor()
    result.text_segments = [
        documentai.Document.TextAnchor.TextSegment(
            start_index=span.begin, end_index=span.end
        )
        for span in sorted(tree)
    ]
    return result
69+
70+
71+
def _get_text_anchor_in_bbox(
    bbox: documentai.BoundingPoly,
    page: documentai.Document.Page,
    token_in_bounding_box_function: Callable[
        [documentai.BoundingPoly, documentai.BoundingPoly], bool
    ] = _midpoint_in_bpoly,
) -> documentai.Document.TextAnchor:
    r"""Returns the merged TextAnchor of the tokens of `page` located in `bbox`.

    Args:
        bbox (documentai.BoundingPoly):
            Required. The region the tokens are tested against.
        page (documentai.Document.Page):
            Required. The page whose tokens are scanned.
        token_in_bounding_box_function (Callable):
            Optional. Predicate called with (token bounding poly, bbox) that
            decides whether a token belongs to the region. Defaults to
            `_midpoint_in_bpoly`.

    Returns:
        documentai.Document.TextAnchor:
            The merged anchor of all matching tokens; an empty TextAnchor
            when no token matches.

    """
    merged_anchor = documentai.Document.TextAnchor()
    for token in page.tokens:
        token_layout = token.layout
        if not token_in_bounding_box_function(token_layout.bounding_poly, bbox):
            continue
        merged_anchor = _merge_text_anchors(merged_anchor, token_layout.text_anchor)
    return merged_anchor
85+
86+
87+
def _get_norm_x_max(bbox: geometry.BoundingPoly) -> float:
    """Returns the largest x among the normalized vertices of `bbox`."""
    return max(vertex.x for vertex in bbox.normalized_vertices)
89+
90+
91+
def _get_norm_x_min(bbox: geometry.BoundingPoly) -> float:
    """Returns the smallest x among the normalized vertices of `bbox`."""
    return min(vertex.x for vertex in bbox.normalized_vertices)
93+
94+
95+
def _get_norm_y_max(bbox: geometry.BoundingPoly) -> float:
    """Returns the largest y among the normalized vertices of `bbox`."""
    return max(vertex.y for vertex in bbox.normalized_vertices)
97+
98+
99+
def _get_norm_y_min(bbox: geometry.BoundingPoly) -> float:
    """Returns the smallest y among the normalized vertices of `bbox`."""
    return min(vertex.y for vertex in bbox.normalized_vertices)
101+
102+
103+
def _normalize_coordinates(x, y) -> float:
104+
return round(float(x / y), 9)
105+
106+
107+
def _convert_to_pixels(x: float, conversion_rate: float) -> float:
108+
return x * conversion_rate
109+
110+
111+
def _convert_bbox_units(
    coordinate, input_bbox_units, width=None, height=None, multiplier=1
) -> float:
    r"""Returns a converted coordinate.

    Coordinates given in "pxl", "inch" or "cm" are brought to pixels (inches
    and centimetres via fixed rates of 96 and 37.795) and then divided by the
    supplied page dimension. Any other unit, including "normalized", leaves
    the coordinate as it is. The result is finally scaled by `multiplier`.

    Args:
        coordinate (float):
            Required. The coordinate to convert.
        input_bbox_units (str):
            Required. The bounding box units.
        width (float):
            Optional. Page dimension used as divisor; takes precedence over
            `height` when both are given.
        height (float):
            Optional. Page dimension used as divisor when `width` is None.
        multiplier (float):
            Optional. Factor applied to the converted coordinate.

    Returns:
        float:
            A converted coordinate.

    """
    if input_bbox_units == "pxl":
        pixels = coordinate
    elif input_bbox_units == "inch":
        pixels = _convert_to_pixels(coordinate, 96)
    elif input_bbox_units == "cm":
        pixels = _convert_to_pixels(coordinate, 37.795)
    else:
        # "normalized" and unrecognised units are only scaled.
        return coordinate * multiplier

    page_dimension = height if width is None else width
    return _normalize_coordinates(pixels, page_dimension) * multiplier
154+
155+
156+
def _get_multiplier(
    docproto_coordinate: float, external_coordinate: float, input_bbox_units: str
) -> float:
    r"""Returns a multiplier to use when converting bounding boxes.

    The multiplier is the document.proto coordinate divided by the external
    coordinate, after the external coordinate has been brought to pixels for
    the "inch" (rate 96) and "cm" (rate 37.795) units.

    Args:
        docproto_coordinate (float):
            Required. The coordinate from document.proto.
        external_coordinate (float):
            Required. The coordinate from external annotations.
        input_bbox_units (str):
            Required. The bounding box units.

    Returns:
        float:
            multiplier to use when converting bounding boxes.

    """
    if input_bbox_units == "inch":
        external_pixels = _convert_to_pixels(external_coordinate, 96)
    elif input_bbox_units == "cm":
        external_pixels = _convert_to_pixels(external_coordinate, 37.795)
    else:
        external_pixels = external_coordinate
    return docproto_coordinate / external_pixels
181+
182+
183+
def _convert_bbox_to_docproto_bbox(block) -> geometry.BoundingPoly:
    r"""Returns a converted bounding box from Block.

    The external bounding box stored on `block` is turned into normalized
    vertices according to `block.bounding_type`:

    * "1": a list of mappings, one per vertex, keyed by `block.bounding_x`
      and `block.bounding_y`.
    * "2": a single mapping holding one (x, y) corner, expanded with
      `block.bounding_width` and `block.bounding_height`.
    * "3": a flat sequence of alternating x and y values.

    For any other bounding type the bounding box is used unconverted and
    must already be an iterable of mappings with "x" and "y" keys.

    Args:
        block (Block):
            Required. The block whose bounding box is converted.

    Returns:
        geometry.BoundingPoly:
            A geometry.BoundingPoly from bounding box. When
            `block.bounding_box` equals `[]`, that empty list is returned
            unchanged instead of a BoundingPoly.

    """
    # TODO: _convert_bbox_units should check if external_bbox is list or not
    coordinates_object = block.bounding_box
    if coordinates_object == []:
        # Kept as is for existing callers: an empty box is handed back as the
        # same empty list rather than as an empty BoundingPoly.
        return coordinates_object

    x_multiplier = 1
    y_multiplier = 1
    if block.page_width and block.page_height:
        x_multiplier = _get_multiplier(
            docproto_coordinate=block.docproto_width,
            external_coordinate=block.page_width,
            input_bbox_units=block.bounding_unit,
        )
        y_multiplier = _get_multiplier(
            docproto_coordinate=block.docproto_height,
            external_coordinate=block.page_height,
            input_bbox_units=block.bounding_unit,
        )

    if block.bounding_type == "1":
        # Type 1 : bounding box has 4 (x,y) coordinates
        # NOTE(review): a non-list type 1 box is assumed to pass through
        # unconverted; confirm against the upstream file.
        if isinstance(coordinates_object, list):
            converted = []
            for coordinate in coordinates_object:
                x = _convert_bbox_units(
                    coordinate[f"{block.bounding_x}"],
                    input_bbox_units=block.bounding_unit,
                    width=block.docproto_width,
                    multiplier=x_multiplier,
                )
                y = _convert_bbox_units(
                    coordinate[f"{block.bounding_y}"],
                    input_bbox_units=block.bounding_unit,
                    height=block.docproto_height,
                    multiplier=y_multiplier,
                )
                converted.append({"x": x, "y": y})

            coordinates_object = converted

    elif block.bounding_type == "2":
        # Type 2 : bounding box has 1 (x,y) coordinates for the top left corner
        #          and (width, height)
        # NOTE(review): unlike the other branches this one divides by the
        # external page size and adds bounding_width / bounding_height without
        # unit conversion -- confirm this is intended.
        x = _convert_bbox_units(
            coordinates_object[f"{block.bounding_x}"],
            input_bbox_units=block.bounding_unit,
            width=block.page_width,
            multiplier=x_multiplier,
        )
        y = _convert_bbox_units(
            coordinates_object[f"{block.bounding_y}"],
            input_bbox_units=block.bounding_unit,
            width=block.page_height,
            multiplier=y_multiplier,
        )
        x_far = x + block.bounding_width
        y_far = y + block.bounding_height

        coordinates_object = [
            {"x": x, "y": y},  # x_min_y_min
            {"x": x_far, "y": y},  # x_max_y_min
            {"x": x_far, "y": y_far},  # x_max_y_max
            {"x": x, "y": y_far},  # x_min_y_max
        ]

    elif block.bounding_type == "3":
        # Type 3 : bounding box is a flat sequence of alternating x and y
        #          values, i.e. [x1, y1, x2, y2, ...]
        converted = []
        for idx in range(0, len(block.bounding_box), 2):
            x = _convert_bbox_units(
                block.bounding_box[idx],
                input_bbox_units=block.bounding_unit,
                width=block.docproto_width,
                multiplier=x_multiplier,
            )
            # The y value is normalized by the page height; it is passed via
            # the `width` argument, which _convert_bbox_units uses as divisor.
            y = _convert_bbox_units(
                block.bounding_box[idx + 1],
                input_bbox_units=block.bounding_unit,
                width=block.docproto_height,
                multiplier=y_multiplier,
            )
            converted.append({"x": x, "y": y})

        coordinates_object = converted

    merged_bbox = geometry.BoundingPoly()
    merged_bbox.normalized_vertices = [
        documentai.NormalizedVertex(x=point["x"], y=point["y"])
        for point in coordinates_object
    ]
    return merged_bbox

0 commit comments

Comments
 (0)