Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit 3f52e82

Browse files
authored
fix: Updates to hOCR Template to follow hOCR Spec (#195)
- Added validation in testing with https://github.com/kba/hocr-spec-python
1 parent e05cf50 commit 3f52e82

10 files changed

Lines changed: 60 additions & 38 deletions

File tree

google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,9 @@
66
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
77
<meta name="ocr-system" content="Document AI OCR" />
88
<meta name="ocr-langs" content="unknown" />
9+
<meta name="ocr-scripts" content="unknown" />
910
<meta name="ocr-number-of-pages" content="{{ pages|length }}" />
10-
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
11+
<meta name="ocr-capabilities" content="ocrp_lang ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
1112
</head>
1213
<body>
1314
{% for page in pages -%}
@@ -16,13 +17,13 @@
1617
{% set bidx = loop.index0 -%}
1718
<span class='ocr_carea' id='block_{{ page_number }}_{{ bidx }}' title='{{ docai_block.hocr_bounding_box -}}'>{% for paragraph in docai_block.paragraphs -%}
1819
{% set paridx = loop.index0 -%}
19-
<span class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
20+
<p class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
2021
{% set lidx = loop.index0 -%}
2122
<span class='ocr_line' id='line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title='{{ line.hocr_bounding_box }}'>{{ line.text }}{% for token in line.tokens -%}
2223
{% set tidx = loop.index0 -%}
2324
<span class='ocrx_word' id='word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title='{{ token.hocr_bounding_box }}'>{{ token.text }}</span>{% endfor -%}
2425
</span>{% endfor -%}
25-
</span>{% endfor -%}
26+
</p>{% endfor -%}
2627
</span>{% endfor -%}
2728
</div>
2829
{% endfor -%}

samples/snippets/test_convert_document_to_hocr_sample.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,11 @@ def test_convert_document_to_hocr_sample() -> None:
2424
document_path=document_path, document_title=document_title
2525
)
2626

27-
with open("../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f:
27+
with open(
28+
"../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml",
29+
"r",
30+
encoding="utf-8",
31+
) as f:
2832
expected = f.read()
2933

3034
assert actual == expected

setup.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,7 @@
6666
"immutabledict >= 2.0.0, < 3.0.0dev; python_version<'3.8'",
6767
"Pillow >= 9.5.0, < 11.0.0",
6868
"Jinja2 >= 3.1.0, <= 4.0.0",
69+
"hocr-spec >= 0.2.0",
6970
),
7071
python_requires=">=3.7",
7172
classifiers=[

testing/constraints-3.10.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,3 +11,4 @@ google-cloud-documentai
1111
google-cloud-storage
1212
numpy
1313
pikepdf
14+
hocr-spec

testing/constraints-3.11.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,3 +11,4 @@ google-cloud-documentai
1111
google-cloud-storage
1212
numpy
1313
pikepdf
14+
hocr-spec

testing/constraints-3.7.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,3 +14,4 @@ google-cloud-documentai==2.20.0
1414
google-cloud-storage==2.7.0
1515
numpy==1.19.5
1616
pikepdf==6.2.9
17+
hocr-spec==0.2.0

testing/constraints-3.8.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,3 +11,4 @@ google-cloud-documentai
1111
google-cloud-storage
1212
numpy==1.21.6
1313
pikepdf==8.2.3
14+
hocr-spec

testing/constraints-3.9.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,3 +11,4 @@ google-cloud-documentai
1111
google-cloud-storage
1212
numpy
1313
pikepdf
14+
hocr-spec

tests/unit/resources/toolbox_invoice_test_0_hocr.xml

Lines changed: 34 additions & 33 deletions
Large diffs are not rendered by default.

tests/unit/test_document.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
from io import BytesIO
1718
import json
1819
import os
1920
import shutil
@@ -32,6 +33,8 @@
3233
from google.cloud import documentai
3334
from google.cloud.documentai_toolbox import document, gcs_utilities
3435

36+
from hocr_spec import HocrValidator
37+
3538

3639
def get_bytes(file_name):
3740
result = []
@@ -689,8 +692,15 @@ def test_export_hocr_str():
689692
)
690693

691694
actual_hocr = wrapped_document.export_hocr_str(title="toolbox_invoice_test-0")
695+
assert actual_hocr
696+
validator = HocrValidator(profile="standard")
697+
report = validator.validate(BytesIO(actual_hocr.encode("utf-8")), parse_strict=True)
698+
699+
assert report.format("bool")
692700

693-
with open("tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f:
701+
with open(
702+
"tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r", encoding="utf-8"
703+
) as f:
694704
expected = f.read()
695705

696706
assert actual_hocr == expected

0 commit comments

Comments
 (0)