Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit c877b22

Browse files
authored
feat: Add Support for Layout Parser Documents (#334)
- Adds `Document.chunks` - an iterator of `documentai.Document.ChunkedDocument.Chunk`
- Adds `Document.document_layout_blocks` - an un-nested iterator of `documentai.Document.DocumentLayout.DocumentLayoutBlock`
- Updated `Document.text` to support Document AI Layout Parser
1 parent 2352cae commit c877b22

File tree

6 files changed

+100
-4
lines changed

6 files changed

+100
-4
lines changed

google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,14 @@
1515
#
1616
"""Wrappers for Document AI Document type."""
1717

18+
import collections
1819
import copy
1920
import dataclasses
2021
from functools import cached_property
2122
import glob
2223
import os
2324
import re
24-
from typing import Dict, List, Optional, Type, Union
25+
from typing import Dict, Iterator, List, Optional, Type, Union
2526

2627
from google.api_core.client_options import ClientOptions
2728
from google.api_core.operation import from_gapic as operation_from_gapic
@@ -38,6 +39,33 @@
3839
from google.cloud.documentai_toolbox.wrappers.page import FormField, Page
3940

4041

42+
def _chunks_from_shards(
    shards: List[documentai.Document],
) -> Iterator[documentai.Document.ChunkedDocument.Chunk]:
    """Lazily yields every chunk found in the given shards.

    Args:
        shards (List[documentai.Document]):
            Required. Document shards to read chunks from.

    Yields:
        documentai.Document.ChunkedDocument.Chunk:
            Each entry of a shard's `chunked_document.chunks`, shard by
            shard, in the order the shards were given.
    """
    for document_shard in shards:
        yield from document_shard.chunked_document.chunks
48+
49+
50+
def _document_layout_blocks_from_shards(
    shards: List[documentai.Document],
) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
    """Lazily yields the layout blocks of every shard as one flat sequence.

    Blocks are visited depth-first, parents first: a block is yielded, then
    the blocks nested under its `text_block`, then its next sibling. Only
    nesting through `text_block.blocks` is followed.

    Args:
        shards (List[documentai.Document]):
            Required. Document shards to read layout blocks from.

    Yields:
        documentai.Document.DocumentLayout.DocumentLayoutBlock:
            Each top-level and nested layout block, shard by shard.
    """
    for document_shard in shards:
        # LIFO stack stored in reverse so that pop() hands blocks back in
        # their original order.
        pending = list(document_shard.document_layout.blocks)
        pending.reverse()

        while pending:
            current = pending.pop()
            yield current

            text_block = current.text_block
            if not text_block:
                continue

            nested = text_block.blocks
            if nested:
                # Push children reversed so the first child is popped next,
                # ahead of the remaining siblings.
                pending.extend(reversed(nested))
67+
68+
4169
def _entities_from_shards(
4270
shards: List[documentai.Document],
4371
) -> List[Entity]:
@@ -379,7 +407,11 @@ class Document:
379407
pages (List[Page]):
380408
A list of `Pages` in the `Document`.
381409
entities (List[Entity]):
382-
A list of `Entities` in the `Document`.
410+
A list of un-nested `Entities` in the `Document`.
411+
chunks (Iterator[documentai.Document.ChunkedDocument.Chunk]):
412+
An iterator of document chunks extracted from a Layout Parser.
413+
document_layout_blocks (Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]):
414+
An iterator of document layout blocks extracted from a Layout Parser.
383415
text (str):
384416
The full text of the `Document`.
385417
"""
@@ -398,6 +430,14 @@ def pages(self):
398430
def entities(self):
399431
return _entities_from_shards(shards=self.shards)
400432

433+
@property
def chunks(self):
    """Returns a fresh iterator over the chunks of every shard.

    A plain `property` is used instead of `cached_property` because
    `_chunks_from_shards` is a generator function: caching its result would
    store a single one-shot iterator, so every access after the first full
    iteration would silently yield nothing.

    Returns:
        Iterator[documentai.Document.ChunkedDocument.Chunk]:
            The chunks of each shard, in shard order.
    """
    return _chunks_from_shards(shards=self.shards)
436+
437+
@property
def document_layout_blocks(self):
    """Returns a fresh iterator over the un-nested layout blocks of every shard.

    A plain `property` is used instead of `cached_property` because
    `_document_layout_blocks_from_shards` is a generator function: caching
    its result would store a single one-shot iterator, so every access after
    the first full iteration would silently yield nothing.

    Returns:
        Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
            The layout blocks of each shard, flattened, in shard order.
    """
    return _document_layout_blocks_from_shards(shards=self.shards)
440+
401441
@cached_property
402442
def text(self):
403443
return "".join(shard.text for shard in self.shards)

samples/snippets/quickstart_sample.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# flake8: noqa: C901
12
# Copyright 2023 Google LLC
23
#
34
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -127,6 +128,20 @@ def quickstart_sample(
127128
if entity.normalized_text:
128129
print(f"\tNormalized Text: {entity.normalized_text}")
129130

131+
# Only supported with Layout Parser
132+
for chunk in wrapped_document.chunks:
133+
print(f"Chunk {chunk.chunk_id}: {chunk.content}")
134+
135+
for block in wrapped_document.document_layout_blocks:
136+
print(f"Document Layout Block {block.block_id}")
137+
138+
if block.text_block:
139+
print(f"{block.text_block.type_}: {block.text_block.text}")
140+
if block.list_block:
141+
print(f"{block.list_block.type_}: {block.list_block.list_entries}")
142+
if block.table_block:
143+
print(block.table_block.header_rows, block.table_block.body_rows)
144+
130145
# [END documentai_toolbox_quickstart]
131146

132147
return wrapped_document

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
"proto-plus>=1.22.3, <2.0.0dev",
6161
"grpc-google-iam-v1>=0.12.6, <1.0.0dev",
6262
"google-cloud-bigquery>=3.5.0, <4.0.0dev",
63-
"google-cloud-documentai>=2.20.0, <3.0.0dev",
63+
"google-cloud-documentai>=2.29.2, <3.0.0dev",
6464
"google-cloud-storage>=1.31.0, <3.0.0dev",
6565
"google-cloud-vision>=2.7.0, <4.0.0dev",
6666
"numpy>=1.23.5, <2.0.0",

testing/constraints-3.8.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ pandas==2.0.0
1010
proto-plus==1.22.3
1111
grpc-google-iam-v1==0.12.6
1212
google-cloud-bigquery==3.5.0
13-
google-cloud-documentai==2.20.0
13+
google-cloud-documentai==2.29.2
1414
google-cloud-storage==2.7.0
1515
pandas-gbq==0.21.0
1616
numpy==1.23.5

tests/unit/resources/layout_parser/layout_parser.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

tests/unit/test_document.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,46 @@ def test_document_from_documentai_document_with_single_shard():
395395
assert len(actual.text) > 0
396396

397397

398+
def test_document_from_documentai_document_layout_parser():
    """Checks chunk and layout-block extraction on a Layout Parser fixture."""
    fixture_path = "tests/unit/resources/layout_parser/layout_parser.json"
    with open(fixture_path, "r", encoding="utf-8") as fixture:
        raw_json = fixture.read()
    layout_doc = documentai.Document.from_json(raw_json)

    wrapped = document.Document.from_documentai_document(
        documentai_document=layout_doc
    )

    # Chunks: the fixture holds exactly two, with overlapping page spans.
    chunks = list(wrapped.chunks)
    assert len(chunks) == 2
    first_chunk, second_chunk = chunks

    assert first_chunk.chunk_id == "c1"
    assert "CHAPTER I" in first_chunk.content
    assert first_chunk.page_span.page_start == 1
    assert first_chunk.page_span.page_end == 8

    assert second_chunk.chunk_id == "c2"
    assert "Was that me?" in second_chunk.content
    assert second_chunk.page_span.page_start == 8
    assert second_chunk.page_span.page_end == 15

    # Layout blocks: flattened output must come back in block-id order.
    blocks = list(wrapped.document_layout_blocks)

    for expected_id, layout_block in enumerate(blocks, start=1):
        assert int(layout_block.block_id) == expected_id

    assert len(blocks) == 175
    heading, paragraph = blocks[0], blocks[1]

    assert heading.block_id == "1"
    assert heading.text_block.text == "CHAPTER I"
    assert heading.text_block.type_ == "heading-1"
    assert heading.text_block.blocks
    assert heading.page_span.page_start == 1
    assert heading.page_span.page_end == 8

    assert paragraph.block_id == "2"
    assert paragraph.text_block.text == "IN WHICH We Are Introduced to"
    assert paragraph.text_block.type_ == "paragraph"
    assert paragraph.page_span.page_start == 1
    assert paragraph.page_span.page_end == 1
436+
437+
398438
def test_document_from_gcs_with_single_shard(get_bytes_single_file_mock):
399439
actual = document.Document.from_gcs(
400440
gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0/"

0 commit comments

Comments
 (0)