Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
cccd359
feat: enable mypy session for documentai-toolbox
chalmerlowe Apr 16, 2026
6abd528
fix(documentai-toolbox): resolve mypy errors
chalmerlowe Apr 17, 2026
cbbacda
chore(documentai-toolbox): fix linting and formatting
chalmerlowe Apr 17, 2026
20d9ae8
fix(documentai-toolbox): move type ignore to correct line in gcs_util…
chalmerlowe Apr 17, 2026
67eb298
feat(documentai-toolbox): use ConfigOrData type alias for Block fields
chalmerlowe Apr 17, 2026
3ca1496
feat(documentai-toolbox): update ConfigOrData to include List and use…
chalmerlowe Apr 17, 2026
c2c4e4e
fix(documentai-toolbox): replace assert with explicit check in gcs_ut…
chalmerlowe Apr 17, 2026
1dea669
fix(documentai-toolbox): replace redundant cast with isinstance check…
chalmerlowe Apr 17, 2026
5c1fe11
style: blacken block.py
chalmerlowe Apr 17, 2026
94a5f3e
fix(documentai-toolbox): use specific types for Block fields and reve…
chalmerlowe Apr 17, 2026
1a972f0
fix(documentai-toolbox): fix mypy errors in block.py and bbox_convers…
chalmerlowe Apr 17, 2026
7126d34
fix(documentai-toolbox): fix remaining mypy errors in bbox_conversion.py
chalmerlowe Apr 17, 2026
f8312eb
style: blacken and format documentai-toolbox files
chalmerlowe Apr 17, 2026
bddf806
fix(documentai-toolbox): expect dict for bounding_box in Type 2 in bb…
chalmerlowe Apr 17, 2026
49a17b1
fix(documentai-toolbox): allow both list and dict for entities in blo…
chalmerlowe Apr 17, 2026
410c1ad
fix(documentai-toolbox): fix mypy error for storage import in gcs_uti…
chalmerlowe Apr 17, 2026
ff8695d
fix(lint): remove unused SimpleNamespace import in bbox_conversion.py
chalmerlowe Apr 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,14 @@
from .utilities import docai_utilities, gcs_utilities
from .wrappers import document, entity, page

__all__ = (document, page, entity, converter, docai_utilities, gcs_utilities)
__all__ = (
"document",
"page",
"entity",
"converter",
"docai_utilities",
"gcs_utilities",
)


class Python37DeprecationWarning(DeprecationWarning): # pragma: NO COVER
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
#


from typing import Callable, List, Optional

from intervaltree import intervaltree
Expand Down Expand Up @@ -190,16 +191,21 @@ def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly:
y_multiplier = 1.0
normalized_vertices: List[documentai.NormalizedVertex] = []

if block.page_width and block.page_height:
if (
block.page_width
and block.page_height
and block.docproto_width is not None
and block.docproto_height is not None
):
x_multiplier = _get_multiplier(
docproto_coordinate=block.docproto_width,
external_coordinate=block.page_width,
input_bbox_units=block.bounding_unit,
input_bbox_units=block.bounding_unit or "normalized",
)
y_multiplier = _get_multiplier(
docproto_coordinate=block.docproto_height,
external_coordinate=block.page_height,
input_bbox_units=block.bounding_unit,
input_bbox_units=block.bounding_unit or "normalized",
)

if block.bounding_type == "1":
Expand All @@ -208,13 +214,13 @@ def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly:
for coordinate in block.bounding_box:
x = _convert_bbox_units(
coordinate[f"{block.bounding_x}"],
input_bbox_units=block.bounding_unit,
input_bbox_units=block.bounding_unit or "normalized",
width=block.docproto_width,
multiplier=x_multiplier,
)
y = _convert_bbox_units(
coordinate[f"{block.bounding_y}"],
input_bbox_units=block.bounding_unit,
input_bbox_units=block.bounding_unit or "normalized",
height=block.docproto_height,
multiplier=y_multiplier,
)
Expand All @@ -224,18 +230,24 @@ def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly:
elif block.bounding_type == "2":
# Type 2 : bounding box has 1 (x,y) coordinates for the top left corner
# and (width, height)
if not isinstance(block.bounding_box, dict):
raise TypeError("Expected dict for bounding_box in Type 2")
x_min = _convert_bbox_units(
block.bounding_box[f"{block.bounding_x}"],
input_bbox_units=block.bounding_unit,
input_bbox_units=block.bounding_unit or "normalized",
width=block.page_width,
multiplier=x_multiplier,
)
y_min = _convert_bbox_units(
block.bounding_box[f"{block.bounding_y}"],
input_bbox_units=block.bounding_unit,
input_bbox_units=block.bounding_unit or "normalized",
width=block.page_height,
multiplier=y_multiplier,
)
if block.bounding_width is None or block.bounding_height is None:
raise ValueError(
"bounding_width and bounding_height must be set for Type 2"
)
x_max = x_min + block.bounding_width
y_max = y_min + block.bounding_height
normalized_vertices.extend(
Expand All @@ -249,16 +261,18 @@ def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly:

elif block.bounding_type == "3":
# Type 3 : bounding_box: [x1, y1, x2, y2, x3, y3, x4, y4]
if not isinstance(block.bounding_box, list):
raise TypeError("Expected list for bounding_box in Type 3")
for idx in range(0, len(block.bounding_box), 2):
x = _convert_bbox_units(
block.bounding_box[idx],
input_bbox_units=block.bounding_unit,
input_bbox_units=block.bounding_unit or "normalized",
width=block.docproto_width,
multiplier=x_multiplier,
)
y = _convert_bbox_units(
block.bounding_box[idx + 1],
input_bbox_units=block.bounding_unit,
input_bbox_units=block.bounding_unit or "normalized",
width=block.docproto_height,
multiplier=y_multiplier,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
import dataclasses
import json
from types import SimpleNamespace
from typing import List, Optional, Type
from typing import Any, List, Optional, Type, Union

from google.cloud import documentai


def _get_target_object(json_data: any, target_object: str) -> Optional[SimpleNamespace]:
def _get_target_object(json_data: Any, target_object: str) -> Any:
r"""Returns SimpleNamespace of target_object.

Args:
Expand Down Expand Up @@ -72,45 +72,39 @@ class Block:
page_number:
Optional.
"""
type_: SimpleNamespace = dataclasses.field(init=True, repr=False)
text: SimpleNamespace = dataclasses.field(init=True, repr=False)
bounding_box: Optional[SimpleNamespace] = dataclasses.field(
type_: Any = dataclasses.field(init=True, repr=False)
text: str = dataclasses.field(init=True, repr=False)
bounding_box: Optional[Union[SimpleNamespace, List[Any]]] = dataclasses.field(
init=True, repr=False, default=None
)
block_references: Optional[SimpleNamespace] = dataclasses.field(
init=True, repr=False, default=None
)
block_id: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
confidence: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
page_number: Optional[SimpleNamespace] = dataclasses.field(
block_references: Any = dataclasses.field(init=True, repr=False, default=None)
block_id: Optional[str] = dataclasses.field(init=False, repr=False, default=None)
confidence: Optional[float] = dataclasses.field(
init=False, repr=False, default=None
)
page_width: Optional[SimpleNamespace] = dataclasses.field(
page_number: Optional[int] = dataclasses.field(init=False, repr=False, default=None)
page_width: Optional[float] = dataclasses.field(
init=False, repr=False, default=None
)
page_height: Optional[SimpleNamespace] = dataclasses.field(
page_height: Optional[float] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_width: Optional[SimpleNamespace] = dataclasses.field(
bounding_width: Optional[float] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_height: Optional[SimpleNamespace] = dataclasses.field(
bounding_height: Optional[float] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_type: Optional[SimpleNamespace] = dataclasses.field(
bounding_type: Optional[str] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_unit: Optional[SimpleNamespace] = dataclasses.field(
bounding_unit: Optional[str] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_x: Optional[SimpleNamespace] = dataclasses.field(
bounding_x: Optional[float] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_y: Optional[SimpleNamespace] = dataclasses.field(
bounding_y: Optional[float] = dataclasses.field(
init=False, repr=False, default=None
)
docproto_width: Optional[float] = dataclasses.field(
Expand Down Expand Up @@ -180,6 +174,8 @@ def load_blocks_from_schema(

blocks: List[Block] = []
ens = _get_target_object(objects, entities)
if not isinstance(ens, (list, dict)):
raise TypeError("Expected list or dict for entities")
for i in ens:
entity = i

Expand All @@ -203,11 +199,13 @@ def load_blocks_from_schema(
b = Block(
type_=block_type,
text=block_text,
bounding_box=_get_target_object(entity, normalized_vertices),
bounding_box=_get_target_object(entity, normalized_vertices)
if normalized_vertices is not None
else None,
)

if id_:
b.id_ = _get_target_object(entity, id_)
b.block_id = _get_target_object(entity, id_)
if confidence:
b.confidence = _get_target_object(entity, confidence)
if page_number and page_number in entity:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def convert_from_config(

print("-------- Converting Started --------")
files, labels, did_not_convert = _get_docproto_files(
futures_list, project_id, location, processor_id
list(futures_list), project_id, location, processor_id
)

print("-------- Finished Converting --------")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -243,17 +243,17 @@ def _generate_entity_annotations(
"""
entity_annotations: List[EntityAnnotation] = []
for token in page_info.page.tokens:
v: vision.Vertex = []
v: list[vision.Vertex] = []
if token.layout.bounding_poly.vertices:
for vertex in token.layout.bounding_poly.vertices:
v.append({"x": int(vertex.x), "y": int(vertex.y)})
v.append(vision.Vertex(x=int(vertex.x), y=int(vertex.y)))
else:
for normalized_vertex in token.layout.bounding_poly.normalized_vertices:
v.append(
{
"x": int(normalized_vertex.x * page_info.page.dimension.width),
"y": int(normalized_vertex.y * page_info.page.dimension.height),
}
vision.Vertex(
x=int(normalized_vertex.x * page_info.page.dimension.width),
y=int(normalized_vertex.y * page_info.page.dimension.height),
)
)

text_start_index = token.layout.text_anchor.text_segments[0].start_index
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@

from google.api_core.gapic_v1 import client_info

from google.cloud import documentai, documentai_toolbox, storage
from google.cloud import documentai # type: ignore[attr-defined]
from google.cloud import documentai_toolbox
from google.cloud import storage # type: ignore[attr-defined]
from google.cloud.documentai_toolbox import constants


Expand Down Expand Up @@ -91,6 +93,8 @@ def get_blobs(
if gcs_uri:
gcs_bucket_name, gcs_prefix = split_gcs_uri(gcs_uri)

if gcs_prefix is None:
raise TypeError("gcs_prefix cannot be None")
if re.match(constants.FILE_CHECK_REGEX, gcs_prefix):
raise ValueError("gcs_prefix cannot contain file types")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import glob
import os
import re
from typing import Dict, Iterator, List, Optional, Type, Union
from typing import Any, Dict, Iterable, Iterator, List, Optional, Type, Union

from google.api_core.client_options import ClientOptions
from google.api_core.operation import from_gapic as operation_from_gapic
Expand Down Expand Up @@ -51,7 +51,7 @@ def _document_layout_blocks_from_shards(
shards: List[documentai.Document],
) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
def extract_blocks(
blocks: List[documentai.Document.DocumentLayout.DocumentLayoutBlock],
blocks: Iterable[documentai.Document.DocumentLayout.DocumentLayoutBlock],
) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
queue = collections.deque(blocks)

Expand Down Expand Up @@ -325,8 +325,9 @@ def _dict_to_bigquery(
bq_client = bigquery.Client(
project=project_id, client_info=gcs_utilities._get_client_info()
)
resolved_project_id = project_id or bq_client.project
table_ref = bigquery.DatasetReference(
project=project_id, dataset_id=dataset_name
project=resolved_project_id, dataset_id=dataset_name
).table(table_name)

job_config = bigquery.LoadJobConfig(
Expand All @@ -345,7 +346,7 @@ def _dict_to_bigquery(


def _apply_text_offset(
documentai_object: Union[Dict[str, Dict], List], text_offset: int
documentai_object: Union[Dict[str, Any], List[Any]], text_offset: int
) -> None:
r"""Applies a text offset to all text_segments in `documentai_object`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ class Entity:

_image: Optional[Image.Image] = dataclasses.field(init=False, default=None)

def __post_init__(self, page_offset: int) -> None:
def __post_init__(self, page_offset: Optional[int]) -> None:
if page_offset is None:
page_offset = 0

self.type_ = self.documentai_object.type_

if self.documentai_object.mention_text:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@
from abc import ABC
import dataclasses
from functools import cached_property
from typing import Iterable, List, Optional, Type
from typing import Iterable, List, Optional, Type, TypeVar

import pandas as pd

from google.cloud import documentai
from google.cloud.documentai_toolbox.constants import ElementWithLayout
from google.cloud.documentai_toolbox.utilities import docai_utilities

T = TypeVar("T", bound="_BasePageElement")


@dataclasses.dataclass
class Table:
Expand Down Expand Up @@ -180,9 +182,7 @@ def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment:
"""
return self.documentai_object.layout.text_anchor.text_segments[0]

def _get_children_of_element(
self, potential_children: List["_BasePageElement"]
) -> List["_BasePageElement"]:
def _get_children_of_element(self, potential_children: List[T]) -> List[T]:
"""
Filters potential child elements to identify only those fully contained within this element.

Expand Down
19 changes: 15 additions & 4 deletions packages/google-cloud-documentai-toolbox/noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,21 @@ def prerelease_deps(session, protobuf_implementation):
@nox.session(python=DEFAULT_PYTHON_VERSION)
def mypy(session):
"""Run the type checker."""

# TODO(https://github.com/googleapis/google-cloud-python/issues/16014):
# Enable mypy once this bug is fixed.
session.skip("Temporarily skip mypy. See issue 16014")
session.install(
"mypy<1.16.0",
Comment thread
chalmerlowe marked this conversation as resolved.
"types-requests",
"types-protobuf",
"pandas-stubs",
)
session.install("-e", ".")
session.run(
"mypy",
"-p",
"google.cloud.documentai_toolbox",
"--check-untyped-defs",
"--ignore-missing-imports",
*session.posargs,
)
Comment thread
chalmerlowe marked this conversation as resolved.


@nox.session(python=DEFAULT_PYTHON_VERSION)
Expand Down
Loading