-
Notifications
You must be signed in to change notification settings - Fork 34
Expand file tree
/
Copy pathtest_annotation_output_processor.py
More file actions
85 lines (64 loc) · 2.53 KB
/
test_annotation_output_processor.py
File metadata and controls
85 lines (64 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
"""
Test the annotation output formatter.
"""
import pytest
from inscriptis.annotation.output import AnnotationProcessor
from inscriptis.annotation.output.html import HtmlExtractor
from inscriptis.annotation.output.surface import SurfaceExtractor
from inscriptis.annotation.output.xml import XmlExtractor
# Sample annotated document used as input by the tests in this file.
# Each "label" entry reads as [start, end, name]; slicing the text with
# text[start:end] yields "Chur" for all three entries.
EXAMPLE_OUTPUT = {
    "text": (
        "Chur\n\n"
        "Chur is the capital and largest town of the Swiss canton of "
        "the Grisons and lies in the Grisonian Rhine Valley."
    ),
    "label": [
        [0, 4, "h1"],
        [0, 4, "heading"],
        [6, 10, "emphasis"],
    ],
}
def test_abstract_class():
    """Calling the base AnnotationProcessor must raise NotImplementedError."""
    processor = AnnotationProcessor()

    # The call is expected to raise, so its return value is never bound;
    # assigning it to a local would only create an unused variable.
    with pytest.raises(NotImplementedError):
        processor(EXAMPLE_OUTPUT)
def test_surface_annotator():
    """SurfaceExtractor keeps the input keys and adds the surface forms."""
    extractor = SurfaceExtractor()
    annotated = extractor(EXAMPLE_OUTPUT)

    # the keys of the input are still present ...
    for key in ("text", "label"):
        assert key in annotated

    # ... and every label name is now paired with the text it covers
    expected_surface = [
        ("h1", "Chur"),
        ("heading", "Chur"),
        ("emphasis", "Chur"),
    ]
    assert annotated["surface"] == expected_surface
def test_xml_annotator():
    """XmlExtractor renders the annotations as nested XML tags."""
    extractor = XmlExtractor()
    expected_xml = (
        '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
        "<heading><h1>Chur</h1></heading>\n\n<emphasis>"
        "Chur</emphasis> is the capital and largest town "
        "of the Swiss canton of the Grisons and lies in "
        "the Grisonian Rhine Valley.\n</content>"
    )

    # each label shows up as an XML element wrapped around the labelled text
    assert extractor(EXAMPLE_OUTPUT) == expected_xml
def test_html_annotator():
    """HtmlExtractor emits an HTML page with labelled spans."""
    extractor = HtmlExtractor()
    html = extractor(EXAMPLE_OUTPUT)

    # the page opens with an inline style sheet; its content is not checked
    assert html.startswith("<html><head><style>")

    # the segment that follows the first closing style tag must match exactly
    after_style = html.split("</style>")[1]
    expected_remainder = (
        "</head>"
        '<body><pre><span class="heading-label">heading'
        '</span><span class="heading">'
        '<span class="h1-label">h1</span><span class="h1">'
        "Chur</span></span></pre>\n"
        "<pre></pre>\n"
        '<pre><span class="emphasis-label">emphasis</span>'
        '<span class="emphasis">Chur</span> is the capital '
        "and largest town of the Swiss canton of the "
        "Grisons and lies in the Grisonian Rhine Valley."
        "</pre></body></html>"
    )
    assert after_style == expected_remainder
def test_trailing_tag_annotation():
    """An annotation ending at the last character of the text gets closed."""
    # the single label [9, 14] covers "Gott!", i.e. it runs to the text's end
    document = {"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]}
    expected_xml = (
        '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
        "Ehre sei <emphasis>Gott!</emphasis>\n</content>"
    )

    extractor = XmlExtractor()
    assert extractor(document) == expected_xml