Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit 4f7ba1b

Browse files
authored
refactor: Refactor of hocr functions and fixing lost changes (#137)
* refactor: Reorganize hocr functions
  - Use more jinja templating instead of hardcoding strings
  - Simplified bounding box function
  - Changed parameter name for `_get_hocr_bounding_box` to `page_dimension` for more clarity
* samples: Added sample for converting to hOCR
* refactor: Reordering of classes in page.py
* refactor: Re-added refactoring to remove extra `get_*()` methods in page.py
  - Added in #110, lost in merge
* fix: Moved `templates` directory into package
  - Required for the template to work in the installed library
1 parent a38b68c commit 4f7ba1b

8 files changed

Lines changed: 206 additions & 444 deletions

File tree

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
3+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="unknown" lang="unknown">
4+
<head>
5+
<title>{{ title }}</title>
6+
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
7+
<meta name="ocr-system" content="Document AI OCR" />
8+
<meta name="ocr-langs" content="unknown" />
9+
<meta name="ocr-number-of-pages" content="{{ pages|length }}" />
10+
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
11+
</head>
12+
<body>
13+
{% for page in pages -%}
14+
{% set page_number = page.documentai_object.page_number -%}
15+
<div class='ocr_page' lang='unknown' title='{{ page.hocr_bounding_box -}}'>{% for docai_block in page.blocks -%}
16+
{% set bidx = loop.index0 -%}
17+
<span class='ocr_carea' id='block_{{ page_number }}_{{ bidx }}' title='{{ docai_block.hocr_bounding_box -}}'>{% for paragraph in docai_block.paragraphs -%}
18+
{% set paridx = loop.index0 -%}
19+
<span class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
20+
{% set lidx = loop.index0 -%}
21+
<span class='ocr_line' id='line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title='{{ line.hocr_bounding_box }}'>{{ line.text }}</span>{% for token in line.tokens -%}
22+
{% set tidx = loop.index0 -%}
23+
<span class='ocrx_word' id='word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title='{{ token.hocr_bounding_box }}'>{{ token.text }}</span>{% endfor -%}{% endfor -%}
24+
</span>{% endfor -%}
25+
</span>{% endfor -%}
26+
</div>
27+
{% endfor -%}
28+
</body>
29+
</html>

google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343

4444
from pikepdf import Pdf
4545

46-
from jinja2 import Environment, FileSystemLoader
46+
from jinja2 import Environment, PackageLoader
4747

4848

4949
def _entities_from_shards(
@@ -773,6 +773,12 @@ def export_images(
773773
def export_hocr_str(self, title: str) -> str:
774774
r"""Exports a string hOCR version of the Document.
775775
776+
The id of each object has the following format:
777+
object_{page_index}_...
778+
779+
For example words will have the following id format:
780+
word_{page_index}_{block_index}_{paragraph_index}_{line_index}_{word_index}
781+
776782
Args:
777783
title (str):
778784
Required. The title for hocr_page and head.
@@ -781,15 +787,9 @@ def export_hocr_str(self, title: str) -> str:
781787
str:
782788
A string hOCR version of the Document
783789
"""
784-
environment = Environment(loader=FileSystemLoader("templates/"))
785-
template = environment.get_template("hocr_xml_template.txt")
786-
hocr_pages = ""
787-
number_of_pages = len(self.pages)
788-
for page_to_export in self.pages:
789-
hocr_pages += page_to_export.to_hocr()
790-
791-
content = template.render(
792-
hocr_pages=hocr_pages, number_of_pages=number_of_pages, title=title
790+
environment = Environment(
791+
loader=PackageLoader("google.cloud.documentai_toolbox", "templates")
793792
)
794-
793+
template = environment.get_template("hocr_document_template.xml.j2")
794+
content = template.render(pages=self.pages, title=title)
795795
return content

0 commit comments

Comments
 (0)