Skip to content
6 changes: 4 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
specified with the ``-p`` or ``--postprocessor`` command line argument::

$ inscript https://www.fhgr.ch \
-r ./annotation/examples/annotation-profile.json \
-r ./examples/annotation/annotation-profile.json \
-p surface


Expand Down Expand Up @@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors:
- xml: returns an additional annotated text version::

<?xml version="1.0" encoding="UTF-8" ?>
<content>
<heading>Chur</heading>

<emphasis>Chur</emphasis> is the capital and largest town of the Swiss
canton of the Grisons and lies in the Grisonian Rhine Valley.
</content>

- html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:

Expand All @@ -282,7 +284,7 @@ Currently, inscriptis supports the following postprocessors:

inscript --annotation-rules ./wikipedia.json \
--postprocessor html \
https://en.wikipedia.org/wiki/Chur.html
https://en.wikipedia.org/wiki/Chur

Annotation rules encoded in the ``wikipedia.json`` file:

Expand Down
9 changes: 6 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "inscriptis"
version = "2.5.3"
version = "2.6.0"
authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
description = "inscriptis - HTML to text converter."
keywords = ["HTML", "converter", "text"]
Expand Down Expand Up @@ -44,8 +44,11 @@ requests = ">=2.32.2"
lxml = ">=4.9.3"

# optional dependencies
fastapi = { version = "^0.109.1", optional = true }
uvicorn = { version = "^0.27.1", optional = true }
fastapi = { version = "^0.115.11", optional = true }
uvicorn = { version = "^0.34.0", optional = true }

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.5"


[build-system]
Expand Down
5 changes: 3 additions & 2 deletions src/inscriptis/annotation/output/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
2. The overwritten :meth:`__call__` method may either extend the original
dictionary which contains the extracted text and annotations (e.g.,
:class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
may replace it with an custom output (e.g.,
may replace it with a custom output (e.g.,
:class:`~inscriptis.annotation.output.html.HtmlExtractor` and
:class:`~inscriptis.annotation.output.xml.XmlExtractor`.
:class:`~inscriptis.annotation.output.xml.XmlExtractor`).

Currently, Inscriptis supports the following built-in AnnotationProcessors:

Expand All @@ -25,6 +25,7 @@
of the extracted annotations.

"""

from typing import Dict, Any


Expand Down
44 changes: 14 additions & 30 deletions src/inscriptis/annotation/output/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""HTML Annotation Processor."""

from collections import defaultdict
from itertools import cycle
from typing import Dict, Any, List
Expand All @@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor):
verbatim = True

def __call__(self, annotated_text: Dict[str, Any]) -> str:
    """Provide an HTML version of the given text and annotations.

    Args:
        annotated_text: a dictionary containing the plain text (key
            ``"text"``) and the extracted annotations (key ``"label"``) as
            ``(start, end, label)`` triples.

    Returns:
        A string with an HTML document in which every annotation is
        wrapped into a ``<span>`` that is preceded by a label ``<span>``.
    """
    # Function-scope import: only this method needs the escaping helper.
    from html import escape

    def render(chunk: str) -> str:
        # The text originates from arbitrary web pages, so it must be
        # escaped before it is embedded into the HTML output. Every line
        # of the text is placed into its own <pre> element.
        return escape(chunk, quote=False).replace("\n", "</pre>\n<pre>")

    # Map each text index to the markup emitted at that position. Opening
    # markup is appended and closing markup is prepended, so that closing
    # tags precede opening tags whenever both occur at the same index.
    tag_dict = defaultdict(list)
    for start, end, label in reversed(annotated_text["label"]):
        css_class = escape(label)
        tag_dict[start].append(
            f'<span class="{css_class}-label">{css_class}</span>'
            f'<span class="{css_class}">'
        )
        tag_dict[end].insert(0, "</span>")

    tagged_content = [
        "<html><head><style>",
        self._get_css(annotated_text["label"]),
        "</style></head><body><pre>",
    ]

    text = annotated_text["text"]
    current_idx = 0
    for idx, tags in sorted(tag_dict.items()):
        tagged_content.append(render(text[current_idx:idx]))
        current_idx = idx
        tagged_content.extend(tags)
    # Text following the last annotation boundary; it uses the same line
    # break replacement as above so that the <pre> elements stay balanced.
    tagged_content.append(render(text[current_idx:]))
    return "".join(tagged_content) + "</pre></body></html>"

@staticmethod
Expand Down
45 changes: 13 additions & 32 deletions src/inscriptis/annotation/output/xml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""XML Annotation processor."""

from collections import defaultdict
from typing import Dict, Any

Expand All @@ -10,40 +11,20 @@ class XmlExtractor(AnnotationProcessor):

verbatim = True

def __call__(
    self, annotated_text: Dict[str, Any], root_element: str = "content"
) -> str:
    """Provide an XML version of the given text and annotations.

    Args:
        annotated_text: a dictionary containing the plain text (key
            ``"text"``) and the extracted annotations (key ``"label"``) as
            ``(start, end, tag)`` triples.
        root_element: name of the element that wraps the serialized text.

    Returns:
        A string with the XML-version of the content.
    """
    # Function-scope import: only this method needs the escaping helper.
    from xml.sax.saxutils import escape

    # Map each text index to the tags emitted at that position. Opening
    # tags are appended and closing tags are prepended, so that closing
    # tags precede opening tags whenever both occur at the same index and
    # annotations sharing both boundaries are closed in reverse order.
    tag_dict = defaultdict(list)
    for start, end, tag in reversed(annotated_text["label"]):
        tag_dict[start].append(f"<{tag}>")
        tag_dict[end].insert(0, f"</{tag}>")

    text = annotated_text["text"]
    tagged_content = [
        '<?xml version="1.0" encoding="UTF-8" ?>\n',
        f"<{root_element}>\n",
    ]
    current_idx = 0
    for idx, tags in sorted(tag_dict.items()):
        # Escape the character data so that '&', '<' and '>' occurring in
        # the text do not produce malformed XML.
        tagged_content.append(escape(text[current_idx:idx]))
        current_idx = idx
        tagged_content.extend(tags)

    # Text following the last annotation boundary.
    tagged_content.append(escape(text[current_idx:]))
    tagged_content.append(f"\n</{root_element}>")
    return "".join(tagged_content)
4 changes: 3 additions & 1 deletion src/inscriptis/html_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ class Inscriptis:
text = parser.get_text()
"""

def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
def __init__(
self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
) -> None:
# use the default configuration, if no config object is provided
config = config or ParserConfig()

Expand Down
21 changes: 10 additions & 11 deletions tests/test_annotation_output_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"text": "Chur\n\nChur is the capital and largest town of "
"the Swiss canton of the Grisons and lies in the "
"Grisonian Rhine Valley.",
"label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]],
"label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]],
}


Expand All @@ -36,8 +36,8 @@

# and we have additional information on surface forms :)
assert result["surface"] == [
("heading", "Chur"),
("h1", "Chur"),
("heading", "Chur"),
("emphasis", "Chur"),
]

Expand All @@ -48,21 +48,21 @@

# and we have additional information on surface forms :)
assert result == (
'<?xml version="1.0" encoding="UTF-8" ?>\n'
"<h1><heading>Chur</heading></h1>\n\n<emphasis>"
'<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
"<heading><h1>Chur</h1></heading>\n\n<emphasis>"
"Chur</emphasis> is the capital and largest town "
"of the Swiss canton of the Grisons and lies in "
"the Grisonian Rhine Valley."
"the Grisonian Rhine Valley.\n</content>"
)


def test_html_annotator():
processor = HtmlExtractor()
result = processor(EXAMPLE_OUTPUT)


assert result.startswith("<html><head><style>")
assert result.endswith(
"</style></head>"
assert result.split("</style>")[1] == ("</head>"

Check notice on line 65 in tests/test_annotation_output_processor.py

View check run for this annotation

codefactor.io / CodeFactor

tests/test_annotation_output_processor.py#L65

Multiple spaces after operator. (E222)
'<body><pre><span class="heading-label">heading'
'</span><span class="heading">'
'<span class="h1-label">h1</span><span class="h1">'
Expand All @@ -72,15 +72,14 @@
'<span class="emphasis">Chur</span> is the capital '
"and largest town of the Swiss canton of the "
"Grisons and lies in the Grisonian Rhine Valley."
"</pre></body></html>"
)
"</pre></body></html>")


def test_trailing_tag_annotation():
    """Ensure an annotation that ends with the last character is closed."""
    processor = XmlExtractor()
    result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]})

    # The expected value must contain only the serialization with the
    # enclosing <content> element (and not the former, unwrapped variant).
    assert result == (
        '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
        "Ehre sei <emphasis>Gott!</emphasis>\n</content>"
    )
75 changes: 75 additions & 0 deletions tests/test_annotation_output_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python

"""
Test the annotation XmlExtractor.
"""
from lxml.html import fromstring

from inscriptis import Inscriptis, ParserConfig
from inscriptis.annotation.output.xml import XmlExtractor


def test_tag_error_issue_93():
    """Check that the XmlExtractor nests tags correctly (Issue #93)."""
    annotation_rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
    source_html = """<html>
<body>
<div class="a">
<span class="b">Item1</span>
<span class="b">Item2</span>
<span class="b">Item3</span>
<span class="b">Item4</span>
</div>
</body>
</html>"""

    # convert the HTML and collect the text together with its annotations
    parser = Inscriptis(
        fromstring(source_html), ParserConfig(annotation_rules=annotation_rules)
    )
    xml_output = XmlExtractor()(
        {"text": parser.get_text(), "label": parser.get_annotations()}
    )

    # the inner elements must be completely enclosed by the outer element
    assert xml_output == (
        """<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n"""
        "<outer><inner> Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
        "<inner>Item4</inner></outer>\n</content>"
    )


def test_tag_folding_issue_93_extended():
    """Check the XML tag nesting for three annotation levels (Issue #93)."""
    annotation_rules = {
        "div#class=a": ["outer"],
        "span#class=b": ["inner"],
        "b": ["bold"],
    }
    source_html = """<html>
<body>
<div class="a">
Some Test to add :)
<span class="b">Item<b>1</b></span>
<span class="b">Item2</span>
<span class="b"><b>Item3</b></span>
<span class="b"><b>It</b>e<b>m4</b></span>
</div>
</body>
</html>"""

    # convert the HTML and collect the text together with its annotations
    parser = Inscriptis(
        fromstring(source_html), ParserConfig(annotation_rules=annotation_rules)
    )
    xml_output = XmlExtractor()(
        {"text": parser.get_text(), "label": parser.get_annotations()}
    )

    # bold elements are nested in inner elements, which are nested in outer
    assert xml_output == (
        """<?xml version="1.0" encoding="UTF-8" ?>\n"""
        """<content>\n"""
        """<outer> Some Test to add :) <inner>Item <bold>1</bold></inner> <inner>Item2 </inner>"""
        """<inner><bold>Item3</bold></inner> <inner><bold>It</bold> e <bold>m4</bold></inner></outer>\n"""
        """</content>"""
    )
Loading