This repository was archived by the owner on Mar 6, 2026. It is now read-only.
File tree Expand file tree Collapse file tree 4 files changed +330
-2
lines changed
google/cloud/documentai_toolbox/templates Expand file tree Collapse file tree 4 files changed +330
-2
lines changed Original file line number Diff line number Diff line change 1919 {% set paridx = loop .index 0 -%}
2020 <p class =' ocr_par' id =' par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title =' {{ paragraph.hocr_bounding_box -}}' >{% for line in paragraph .lines -%}
2121 {% set lidx = loop .index 0 -%}
22- <span class =' ocr_line' id =' line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title =' {{ line.hocr_bounding_box }}' >{{ line.text }}{% for token in line .tokens -%}
22+ <span class =' ocr_line' id =' line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title =' {{ line.hocr_bounding_box }}' >{{ line.text|escape }}{% for token in line .tokens -%}
2323 {% set tidx = loop .index 0 -%}
24- <span class =' ocrx_word' id =' word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title =' {{ token.hocr_bounding_box }}' >{{ token.text }}</span >{% endfor -%}
24+ <span class =' ocrx_word' id =' word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title =' {{ token.hocr_bounding_box }}' >{{ token.text|escape }}</span >{% endfor -%}
2525 </span >{% endfor -%}
2626 </p >{% endfor -%}
2727 </span >{% endfor -%}
Original file line number Diff line number Diff line change 1+ {
2+ "text" : " <Invoice>" ,
3+ "pages" : [
4+ {
5+ "pageNumber" : 1 ,
6+ "dimension" : {
7+ "width" : 1758.0 ,
8+ "height" : 2275.0 ,
9+ "unit" : " pixels"
10+ },
11+ "layout" : {
12+ "textAnchor" : {
13+ "textSegments" : [
14+ {
15+ "endIndex" : " 435"
16+ }
17+ ]
18+ },
19+ "boundingPoly" : {
20+ "vertices" : [
21+ {},
22+ {
23+ "x" : 1758
24+ },
25+ {
26+ "x" : 1758 ,
27+ "y" : 2275
28+ },
29+ {
30+ "y" : 2275
31+ }
32+ ],
33+ "normalizedVertices" : [
34+ {},
35+ {
36+ "x" : 1.0
37+ },
38+ {
39+ "x" : 1.0 ,
40+ "y" : 1.0
41+ },
42+ {
43+ "y" : 1.0
44+ }
45+ ]
46+ },
47+ "orientation" : 1
48+ },
49+ "detectedLanguages" : [
50+ {
51+ "languageCode" : " en"
52+ },
53+ {
54+ "languageCode" : " und"
55+ }
56+ ],
57+ "blocks" : [
58+ {
59+ "layout" : {
60+ "textAnchor" : {
61+ "textSegments" : [
62+ {
63+ "endIndex" : " 8"
64+ }
65+ ]
66+ },
67+ "confidence" : 0.99258333 ,
68+ "boundingPoly" : {
69+ "vertices" : [
70+ {
71+ "x" : 1310 ,
72+ "y" : 220
73+ },
74+ {
75+ "x" : 1534 ,
76+ "y" : 220
77+ },
78+ {
79+ "x" : 1534 ,
80+ "y" : 282
81+ },
82+ {
83+ "x" : 1310 ,
84+ "y" : 282
85+ }
86+ ],
87+ "normalizedVertices" : [
88+ {
89+ "x" : 0.74516493 ,
90+ "y" : 0.0967033
91+ },
92+ {
93+ "x" : 0.8725825 ,
94+ "y" : 0.0967033
95+ },
96+ {
97+ "x" : 0.8725825 ,
98+ "y" : 0.12395605
99+ },
100+ {
101+ "x" : 0.74516493 ,
102+ "y" : 0.12395605
103+ }
104+ ]
105+ },
106+ "orientation" : 1
107+ }
108+ }
109+ ],
110+ "paragraphs" : [
111+ {
112+ "layout" : {
113+ "textAnchor" : {
114+ "textSegments" : [
115+ {
116+ "endIndex" : " 8"
117+ }
118+ ]
119+ },
120+ "confidence" : 0.99258333 ,
121+ "boundingPoly" : {
122+ "vertices" : [
123+ {
124+ "x" : 1310 ,
125+ "y" : 220
126+ },
127+ {
128+ "x" : 1534 ,
129+ "y" : 220
130+ },
131+ {
132+ "x" : 1534 ,
133+ "y" : 282
134+ },
135+ {
136+ "x" : 1310 ,
137+ "y" : 282
138+ }
139+ ],
140+ "normalizedVertices" : [
141+ {
142+ "x" : 0.74516493 ,
143+ "y" : 0.0967033
144+ },
145+ {
146+ "x" : 0.8725825 ,
147+ "y" : 0.0967033
148+ },
149+ {
150+ "x" : 0.8725825 ,
151+ "y" : 0.12395605
152+ },
153+ {
154+ "x" : 0.74516493 ,
155+ "y" : 0.12395605
156+ }
157+ ]
158+ },
159+ "orientation" : 1
160+ }
161+ }
162+ ],
163+ "lines" : [
164+ {
165+ "layout" : {
166+ "textAnchor" : {
167+ "textSegments" : [
168+ {
169+ "endIndex" : " 8"
170+ }
171+ ]
172+ },
173+ "confidence" : 0.99258333 ,
174+ "boundingPoly" : {
175+ "vertices" : [
176+ {
177+ "x" : 1310 ,
178+ "y" : 220
179+ },
180+ {
181+ "x" : 1534 ,
182+ "y" : 220
183+ },
184+ {
185+ "x" : 1534 ,
186+ "y" : 282
187+ },
188+ {
189+ "x" : 1310 ,
190+ "y" : 282
191+ }
192+ ],
193+ "normalizedVertices" : [
194+ {
195+ "x" : 0.74516493 ,
196+ "y" : 0.0967033
197+ },
198+ {
199+ "x" : 0.8725825 ,
200+ "y" : 0.0967033
201+ },
202+ {
203+ "x" : 0.8725825 ,
204+ "y" : 0.12395605
205+ },
206+ {
207+ "x" : 0.74516493 ,
208+ "y" : 0.12395605
209+ }
210+ ]
211+ },
212+ "orientation" : 1
213+ },
214+ "detectedLanguages" : [
215+ {
216+ "languageCode" : " en"
217+ }
218+ ]
219+ }
220+ ],
221+ "tokens" : [
222+ {
223+ "layout" : {
224+ "textAnchor" : {
225+ "textSegments" : [
226+ {
227+ "endIndex" : " 8"
228+ }
229+ ]
230+ },
231+ "confidence" : 0.99258333 ,
232+ "boundingPoly" : {
233+ "vertices" : [
234+ {
235+ "x" : 1310 ,
236+ "y" : 220
237+ },
238+ {
239+ "x" : 1534 ,
240+ "y" : 220
241+ },
242+ {
243+ "x" : 1534 ,
244+ "y" : 282
245+ },
246+ {
247+ "x" : 1310 ,
248+ "y" : 282
249+ }
250+ ],
251+ "normalizedVertices" : [
252+ {
253+ "x" : 0.74516493 ,
254+ "y" : 0.0967033
255+ },
256+ {
257+ "x" : 0.8725825 ,
258+ "y" : 0.0967033
259+ },
260+ {
261+ "x" : 0.8725825 ,
262+ "y" : 0.12395605
263+ },
264+ {
265+ "x" : 0.74516493 ,
266+ "y" : 0.12395605
267+ }
268+ ]
269+ },
270+ "orientation" : 1
271+ },
272+ "detectedLanguages" : [
273+ {
274+ "languageCode" : " en"
275+ }
276+ ]
277+ }
278+ ]
279+ }
280+ ],
281+ "shardInfo" : {
282+ "shardCount" : " 1"
283+ }
284+ }
Original file line number Diff line number Diff line change 1+ <?xml version =" 1.0" encoding =" UTF-8" ?>
2+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
3+ <html xmlns =" http://www.w3.org/1999/xhtml" xml : lang =" unknown" lang =" unknown" >
4+ <head >
5+ <title >hocr-escape</title >
6+ <meta http-equiv =" Content-Type" content =" text/html;charset=utf-8" />
7+ <meta name =" ocr-system" content =" Document AI OCR" />
8+ <meta name =" ocr-langs" content =" unknown" />
9+ <meta name =" ocr-scripts" content =" unknown" />
10+ <meta name =" ocr-number-of-pages" content =" 1" />
11+ <meta name =" ocr-capabilities" content =" ocrp_lang ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
12+ </head >
13+ <body >
14+ <div class =' ocr_page' lang =' unknown' title =' bbox 0 0 1758 2275' ><span class =' ocr_carea' id =' block_1_0' title =' bbox 1310 220 1534 282' ><p class =' ocr_par' id =' par_1_0_0' title =' bbox 1310 220 1534 282' ><span class =' ocr_line' id =' line_1_0_0_0' title =' bbox 1310 220 1534 282' >&lt;Invoice<span class =' ocrx_word' id =' word_1_0_0_0_0' title =' bbox 1310 220 1534 282' >&lt;Invoice</span ></span ></p ></span ></div >
15+ </body >
16+ </html >
Original file line number Diff line number Diff line change 1717import json
1818import os
1919import shutil
20+ from xml .etree import ElementTree
2021
2122# try/except added for compatibility with python < 3.8
2223try :
@@ -791,6 +792,9 @@ def test_export_hocr_str():
791792 actual_hocr = wrapped_document .export_hocr_str (title = "toolbox_invoice_test-0" )
792793 assert actual_hocr
793794
795+ element = ElementTree .fromstring (actual_hocr )
796+ assert element is not None
797+
794798 with open (
795799 "tests/unit/resources/toolbox_invoice_test_0_hocr.xml" , "r" , encoding = "utf-8"
796800 ) as f :
@@ -808,6 +812,30 @@ def test_export_hocr_str_with_blank_document():
808812
809813 assert actual_hocr
810814
815+ element = ElementTree .fromstring (actual_hocr )
816+ assert element is not None
817+
818+
819+ def test_export_hocr_str_with_escape_characters ():
820+ wrapped_document = document .Document .from_document_path (
821+ document_path = "tests/unit/resources/toolbox_invoice_test-0-hocr-escape.json"
822+ )
823+
824+ actual_hocr = wrapped_document .export_hocr_str (title = "hocr-escape" )
825+ assert actual_hocr
826+
827+ element = ElementTree .fromstring (actual_hocr )
828+ assert element is not None
829+
830+ with open (
831+ "tests/unit/resources/toolbox_invoice_test-0-hocr-escape.xml" ,
832+ "r" ,
833+ encoding = "utf-8" ,
834+ ) as f :
835+ expected = f .read ()
836+
837+ assert actual_hocr == expected
838+
811839
812840def test_document_to_merged_documentai_document (get_bytes_multiple_files_mock ):
813841 wrapped_document = document .Document .from_gcs (
You can’t perform that action at this time.
0 commit comments