googleapis
diff --git a/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 b/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2
Lines changed: 29 additions & 0 deletions
diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py
Lines changed: 11 additions & 11 deletions
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="unknown" lang="unknown">
+<head>
+<title>{{ title }}</title>
+<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
+<meta name="ocr-system" content="Document AI OCR" />
+<meta name="ocr-langs" content="unknown" />
+<meta name="ocr-number-of-pages" content="{{ pages|length }}" />
+<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
+</head>
+<body>
+{% for page in pages -%}
+    {% set page_number = page.documentai_object.page_number -%}
+    <div class='ocr_page' lang='unknown' title='{{ page.hocr_bounding_box -}}'>{% for docai_block in page.blocks -%}
+        {% set bidx = loop.index0 -%}
+        <span class='ocr_carea' id='block_{{ page_number }}_{{ bidx }}' title='{{ docai_block.hocr_bounding_box -}}'>{% for paragraph in docai_block.paragraphs -%}
+            {% set paridx = loop.index0 -%}
+            <span class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
+                {% set lidx = loop.index0 -%}
+                <span class='ocr_line' id='line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title='{{ line.hocr_bounding_box }}'>{{ line.text }}</span>{% for token in line.tokens -%}
+                    {% set tidx = loop.index0 -%}
+                    <span class='ocrx_word' id='word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title='{{ token.hocr_bounding_box }}'>{{ token.text }}</span>{% endfor -%}{% endfor -%}
+            </span>{% endfor -%}
+        </span>{% endfor -%}
+    </div>
+{% endfor -%}
+</body>
+</html>
@@ -43,7 +43,7 @@
 
 from pikepdf import Pdf
 
-from jinja2 import Environment, FileSystemLoader
+from jinja2 import Environment, PackageLoader
 
 
 def _entities_from_shards(
@@ -773,6 +773,12 @@ def export_images(
     def export_hocr_str(self, title: str) -> str:
         r"""Exports a string hOCR version of the Document.
 
+            The id of each exported element has the following format:
+                object_{page_index}_...
+
+            For example, words will have the following id format:
+                word_{page_index}_{block_index}_{paragraph_index}_{line_index}_{word_index}
+
         Args:
             title (str):
                 Required. The title for hocr_page and head.
@@ -781,15 +787,9 @@ def export_hocr_str(self, title: str) -> str:
             str:
                 A string hOCR version of the Document
         """
-        environment = Environment(loader=FileSystemLoader("templates/"))
-        template = environment.get_template("hocr_xml_template.txt")
-        hocr_pages = ""
-        number_of_pages = len(self.pages)
-        for page_to_export in self.pages:
-            hocr_pages += page_to_export.to_hocr()
-
-        content = template.render(
-            hocr_pages=hocr_pages, number_of_pages=number_of_pages, title=title
+        environment = Environment(
+            loader=PackageLoader("google.cloud.documentai_toolbox", "templates")
         )
-
+        template = environment.get_template("hocr_document_template.xml.j2")
+        content = template.render(pages=self.pages, title=title)
         return content