feat: Add Support for Layout Parser Documents (#334)

holtskinner · web-flow · commit c877b22367ad · 2024-07-16T13:09:10.000-07:00
- Adds `Document.chunks` - `Iterator[documentai.Document.ChunkedDocument.Chunk]`
- Adds `Document.document_layout_blocks` - An un-nested iterator of `documentai.Document.DocumentLayout.DocumentLayoutBlock`
- Updated `Document.text` to support Document AI Layout Parser
diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py
@@ -15,13 +15,14 @@
 #
 """Wrappers for Document AI Document type."""
 
+import collections
 import copy
 import dataclasses
 from functools import cached_property
 import glob
 import os
 import re
-from typing import Dict, List, Optional, Type, Union
+from typing import Dict, Iterator, List, Optional, Type, Union
 
 from google.api_core.client_options import ClientOptions
 from google.api_core.operation import from_gapic as operation_from_gapic
@@ -38,6 +39,33 @@
 from google.cloud.documentai_toolbox.wrappers.page import FormField, Page
 
 
+def _chunks_from_shards(
+    shards: List[documentai.Document],
+) -> Iterator[documentai.Document.ChunkedDocument.Chunk]:
+    """Yield the chunks of each shard in turn, preserving shard order."""
+    for shard in shards:
+        yield from shard.chunked_document.chunks
+
+
+def _document_layout_blocks_from_shards(
+    shards: List[documentai.Document],
+) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
+    def walk(
+        roots: List[documentai.Document.DocumentLayout.DocumentLayoutBlock],
+    ) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
+        # Pre-order walk; extendleft() reverses, hence the reversed() below.
+        pending = collections.deque(roots)
+        while pending:
+            current = pending.popleft()
+            yield current
+            nested = current.text_block and current.text_block.blocks
+            if nested:
+                pending.extendleft(reversed(nested))
+
+    for shard in shards:
+        yield from walk(shard.document_layout.blocks)
+
+
 def _entities_from_shards(
     shards: List[documentai.Document],
 ) -> List[Entity]:
@@ -379,7 +407,11 @@ class Document:
         pages (List[Page]):
             A list of `Pages` in the `Document`.
         entities (List[Entity]):
-            A list of `Entities` in the `Document`.
+            A list of un-nested `Entities` in the `Document`.
+        chunks (Iterator[documentai.Document.ChunkedDocument.Chunk]):
+            An iterator of document chunks extracted from a Layout Parser.
+        document_layout_blocks (Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]):
+            An iterator of document layout blocks extracted from a Layout Parser.
         text (str):
             The full text of the `Document`.
     """
@@ -398,6 +430,14 @@ def pages(self):
     def entities(self):
         return _entities_from_shards(shards=self.shards)
 
+    @property  # Not cached: a cached generator is exhausted after one pass.
+    def chunks(self):
+        return _chunks_from_shards(shards=self.shards)
+
+    @property  # Not cached: a cached generator is exhausted after one pass.
+    def document_layout_blocks(self):
+        return _document_layout_blocks_from_shards(shards=self.shards)
+
     @cached_property
     def text(self):
         return "".join(shard.text for shard in self.shards)
diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py
@@ -1,3 +1,4 @@
+# flake8: noqa: C901
 # Copyright 2023 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -127,6 +128,20 @@ def quickstart_sample(
         if entity.normalized_text:
             print(f"\tNormalized Text: {entity.normalized_text}")
 
+    # Only supported with Layout Parser
+    for chunk in wrapped_document.chunks:
+        print(f"Chunk {chunk.chunk_id}: {chunk.content}")
+
+    for block in wrapped_document.document_layout_blocks:
+        print(f"Document Layout Block {block.block_id}")
+
+        if block.text_block:
+            print(f"{block.text_block.type_}: {block.text_block.text}")
+        if block.list_block:
+            print(f"{block.list_block.type_}: {block.list_block.list_entries}")
+        if block.table_block:
+            print(block.table_block.header_rows, block.table_block.body_rows)
+
     # [END documentai_toolbox_quickstart]
 
     return wrapped_document
diff --git a/setup.py b/setup.py
@@ -60,7 +60,7 @@
         "proto-plus>=1.22.3, <2.0.0dev",
         "grpc-google-iam-v1>=0.12.6, <1.0.0dev",
         "google-cloud-bigquery>=3.5.0, <4.0.0dev",
-        "google-cloud-documentai>=2.20.0, <3.0.0dev",
+        "google-cloud-documentai>=2.29.2, <3.0.0dev",
         "google-cloud-storage>=1.31.0, <3.0.0dev",
         "google-cloud-vision>=2.7.0, <4.0.0dev",
         "numpy>=1.23.5, <2.0.0",
diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt
@@ -10,7 +10,7 @@ pandas==2.0.0
 proto-plus==1.22.3
 grpc-google-iam-v1==0.12.6
 google-cloud-bigquery==3.5.0
-google-cloud-documentai==2.20.0
+google-cloud-documentai==2.29.2
 google-cloud-storage==2.7.0
 pandas-gbq==0.21.0
 numpy==1.23.5
diff --git a/tests/unit/resources/layout_parser/layout_parser.json b/tests/unit/resources/layout_parser/layout_parser.json
diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py
@@ -395,6 +395,46 @@ def test_document_from_documentai_document_with_single_shard():
     assert len(actual.text) > 0
 
 
+def test_document_from_documentai_document_layout_parser():
+    with open(
+        "tests/unit/resources/layout_parser/layout_parser.json", "r", encoding="utf-8"
+    ) as f:
+        doc = documentai.Document.from_json(f.read())
+
+    actual = document.Document.from_documentai_document(documentai_document=doc)
+    # Materialize the chunk iterator once so individual chunks can be indexed.
+    chunk_list = list(actual.chunks)
+    assert len(chunk_list) == 2
+    assert chunk_list[0].chunk_id == "c1"
+    assert "CHAPTER I" in chunk_list[0].content
+    assert chunk_list[0].page_span.page_start == 1
+    assert chunk_list[0].page_span.page_end == 8
+
+    assert chunk_list[1].chunk_id == "c2"
+    assert "Was that me?" in chunk_list[1].content
+    assert chunk_list[1].page_span.page_start == 8
+    assert chunk_list[1].page_span.page_end == 15
+
+    block_list = list(actual.document_layout_blocks)
+    # Block ids must be consecutive from 1 in the flattened (un-nested) order.
+    for i, block in enumerate(block_list, start=1):
+        assert int(block.block_id) == i
+
+    assert len(block_list) == 175
+    assert block_list[0].block_id == "1"
+    assert block_list[0].text_block.text == "CHAPTER I"
+    assert block_list[0].text_block.type_ == "heading-1"
+    assert block_list[0].text_block.blocks
+    assert block_list[0].page_span.page_start == 1
+    assert block_list[0].page_span.page_end == 8
+
+    assert block_list[1].block_id == "2"
+    assert block_list[1].text_block.text == "IN WHICH We Are Introduced to"
+    assert block_list[1].text_block.type_ == "paragraph"
+    assert block_list[1].page_span.page_start == 1
+    assert block_list[1].page_span.page_end == 1
+
+
 def test_document_from_gcs_with_single_shard(get_bytes_single_file_mock):
     actual = document.Document.from_gcs(
         gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0/"