Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit 2d9f05b

Browse files
authored
fix: Escape html special characters in hocr_document_template.xml.j2 (#279)
* fix: Escape HTML special characters in hocr_document_template.xml.j2
* test: Add unit test for hOCR XML validity.
1 parent 71191ab commit 2d9f05b

File tree

4 files changed

+330
-2
lines changed

4 files changed

+330
-2
lines changed

google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
{% set paridx = loop.index0 -%}
2020
<p class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
2121
{% set lidx = loop.index0 -%}
22-
<span class='ocr_line' id='line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title='{{ line.hocr_bounding_box }}'>{{ line.text }}{% for token in line.tokens -%}
22+
<span class='ocr_line' id='line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title='{{ line.hocr_bounding_box }}'>{{ line.text|escape }}{% for token in line.tokens -%}
2323
{% set tidx = loop.index0 -%}
24-
<span class='ocrx_word' id='word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title='{{ token.hocr_bounding_box }}'>{{ token.text }}</span>{% endfor -%}
24+
<span class='ocrx_word' id='word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title='{{ token.hocr_bounding_box }}'>{{ token.text|escape }}</span>{% endfor -%}
2525
</span>{% endfor -%}
2626
</p>{% endfor -%}
2727
</span>{% endfor -%}
tests/unit/resources/toolbox_invoice_test-0-hocr-escape.json

Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
{
2+
"text": "<Invoice>",
3+
"pages": [
4+
{
5+
"pageNumber": 1,
6+
"dimension": {
7+
"width": 1758.0,
8+
"height": 2275.0,
9+
"unit": "pixels"
10+
},
11+
"layout": {
12+
"textAnchor": {
13+
"textSegments": [
14+
{
15+
"endIndex": "435"
16+
}
17+
]
18+
},
19+
"boundingPoly": {
20+
"vertices": [
21+
{},
22+
{
23+
"x": 1758
24+
},
25+
{
26+
"x": 1758,
27+
"y": 2275
28+
},
29+
{
30+
"y": 2275
31+
}
32+
],
33+
"normalizedVertices": [
34+
{},
35+
{
36+
"x": 1.0
37+
},
38+
{
39+
"x": 1.0,
40+
"y": 1.0
41+
},
42+
{
43+
"y": 1.0
44+
}
45+
]
46+
},
47+
"orientation": 1
48+
},
49+
"detectedLanguages": [
50+
{
51+
"languageCode": "en"
52+
},
53+
{
54+
"languageCode": "und"
55+
}
56+
],
57+
"blocks": [
58+
{
59+
"layout": {
60+
"textAnchor": {
61+
"textSegments": [
62+
{
63+
"endIndex": "8"
64+
}
65+
]
66+
},
67+
"confidence": 0.99258333,
68+
"boundingPoly": {
69+
"vertices": [
70+
{
71+
"x": 1310,
72+
"y": 220
73+
},
74+
{
75+
"x": 1534,
76+
"y": 220
77+
},
78+
{
79+
"x": 1534,
80+
"y": 282
81+
},
82+
{
83+
"x": 1310,
84+
"y": 282
85+
}
86+
],
87+
"normalizedVertices": [
88+
{
89+
"x": 0.74516493,
90+
"y": 0.0967033
91+
},
92+
{
93+
"x": 0.8725825,
94+
"y": 0.0967033
95+
},
96+
{
97+
"x": 0.8725825,
98+
"y": 0.12395605
99+
},
100+
{
101+
"x": 0.74516493,
102+
"y": 0.12395605
103+
}
104+
]
105+
},
106+
"orientation": 1
107+
}
108+
}
109+
],
110+
"paragraphs": [
111+
{
112+
"layout": {
113+
"textAnchor": {
114+
"textSegments": [
115+
{
116+
"endIndex": "8"
117+
}
118+
]
119+
},
120+
"confidence": 0.99258333,
121+
"boundingPoly": {
122+
"vertices": [
123+
{
124+
"x": 1310,
125+
"y": 220
126+
},
127+
{
128+
"x": 1534,
129+
"y": 220
130+
},
131+
{
132+
"x": 1534,
133+
"y": 282
134+
},
135+
{
136+
"x": 1310,
137+
"y": 282
138+
}
139+
],
140+
"normalizedVertices": [
141+
{
142+
"x": 0.74516493,
143+
"y": 0.0967033
144+
},
145+
{
146+
"x": 0.8725825,
147+
"y": 0.0967033
148+
},
149+
{
150+
"x": 0.8725825,
151+
"y": 0.12395605
152+
},
153+
{
154+
"x": 0.74516493,
155+
"y": 0.12395605
156+
}
157+
]
158+
},
159+
"orientation": 1
160+
}
161+
}
162+
],
163+
"lines": [
164+
{
165+
"layout": {
166+
"textAnchor": {
167+
"textSegments": [
168+
{
169+
"endIndex": "8"
170+
}
171+
]
172+
},
173+
"confidence": 0.99258333,
174+
"boundingPoly": {
175+
"vertices": [
176+
{
177+
"x": 1310,
178+
"y": 220
179+
},
180+
{
181+
"x": 1534,
182+
"y": 220
183+
},
184+
{
185+
"x": 1534,
186+
"y": 282
187+
},
188+
{
189+
"x": 1310,
190+
"y": 282
191+
}
192+
],
193+
"normalizedVertices": [
194+
{
195+
"x": 0.74516493,
196+
"y": 0.0967033
197+
},
198+
{
199+
"x": 0.8725825,
200+
"y": 0.0967033
201+
},
202+
{
203+
"x": 0.8725825,
204+
"y": 0.12395605
205+
},
206+
{
207+
"x": 0.74516493,
208+
"y": 0.12395605
209+
}
210+
]
211+
},
212+
"orientation": 1
213+
},
214+
"detectedLanguages": [
215+
{
216+
"languageCode": "en"
217+
}
218+
]
219+
}
220+
],
221+
"tokens": [
222+
{
223+
"layout": {
224+
"textAnchor": {
225+
"textSegments": [
226+
{
227+
"endIndex": "8"
228+
}
229+
]
230+
},
231+
"confidence": 0.99258333,
232+
"boundingPoly": {
233+
"vertices": [
234+
{
235+
"x": 1310,
236+
"y": 220
237+
},
238+
{
239+
"x": 1534,
240+
"y": 220
241+
},
242+
{
243+
"x": 1534,
244+
"y": 282
245+
},
246+
{
247+
"x": 1310,
248+
"y": 282
249+
}
250+
],
251+
"normalizedVertices": [
252+
{
253+
"x": 0.74516493,
254+
"y": 0.0967033
255+
},
256+
{
257+
"x": 0.8725825,
258+
"y": 0.0967033
259+
},
260+
{
261+
"x": 0.8725825,
262+
"y": 0.12395605
263+
},
264+
{
265+
"x": 0.74516493,
266+
"y": 0.12395605
267+
}
268+
]
269+
},
270+
"orientation": 1
271+
},
272+
"detectedLanguages": [
273+
{
274+
"languageCode": "en"
275+
}
276+
]
277+
}
278+
]
279+
}
280+
],
281+
"shardInfo": {
282+
"shardCount": "1"
283+
}
284+
}
tests/unit/resources/toolbox_invoice_test-0-hocr-escape.xml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
3+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="unknown" lang="unknown">
4+
<head>
5+
<title>hocr-escape</title>
6+
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
7+
<meta name="ocr-system" content="Document AI OCR" />
8+
<meta name="ocr-langs" content="unknown" />
9+
<meta name="ocr-scripts" content="unknown" />
10+
<meta name="ocr-number-of-pages" content="1" />
11+
<meta name="ocr-capabilities" content="ocrp_lang ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
12+
</head>
13+
<body>
14+
<div class='ocr_page' lang='unknown' title='bbox 0 0 1758 2275'><span class='ocr_carea' id='block_1_0' title='bbox 1310 220 1534 282'><p class='ocr_par' id='par_1_0_0' title='bbox 1310 220 1534 282'><span class='ocr_line' id='line_1_0_0_0' title='bbox 1310 220 1534 282'>&lt;Invoice<span class='ocrx_word' id='word_1_0_0_0_0' title='bbox 1310 220 1534 282'>&lt;Invoice</span></span></p></span></div>
15+
</body>
16+
</html>

tests/unit/test_document.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import json
1818
import os
1919
import shutil
20+
from xml.etree import ElementTree
2021

2122
# try/except added for compatibility with python < 3.8
2223
try:
@@ -791,6 +792,9 @@ def test_export_hocr_str():
791792
actual_hocr = wrapped_document.export_hocr_str(title="toolbox_invoice_test-0")
792793
assert actual_hocr
793794

795+
element = ElementTree.fromstring(actual_hocr)
796+
assert element is not None
797+
794798
with open(
795799
"tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r", encoding="utf-8"
796800
) as f:
@@ -808,6 +812,30 @@ def test_export_hocr_str_with_blank_document():
808812

809813
assert actual_hocr
810814

815+
element = ElementTree.fromstring(actual_hocr)
816+
assert element is not None
817+
818+
819+
def test_export_hocr_str_with_escape_characters():
820+
wrapped_document = document.Document.from_document_path(
821+
document_path="tests/unit/resources/toolbox_invoice_test-0-hocr-escape.json"
822+
)
823+
824+
actual_hocr = wrapped_document.export_hocr_str(title="hocr-escape")
825+
assert actual_hocr
826+
827+
element = ElementTree.fromstring(actual_hocr)
828+
assert element is not None
829+
830+
with open(
831+
"tests/unit/resources/toolbox_invoice_test-0-hocr-escape.xml",
832+
"r",
833+
encoding="utf-8",
834+
) as f:
835+
expected = f.read()
836+
837+
assert actual_hocr == expected
838+
811839

812840
def test_document_to_merged_documentai_document(get_bytes_multiple_files_mock):
813841
wrapped_document = document.Document.from_gcs(

0 commit comments

Comments
 (0)