diff --git a/README.rst b/README.rst index 361191a..2add60b 100644 --- a/README.rst +++ b/README.rst @@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be specified with the ``-p`` or ``--postprocessor`` command line argument:: $ inscript https://www.fhgr.ch \ - -r ./annotation/examples/annotation-profile.json \ + -r ./examples/annotation/annotation-profile.json \ -p surface @@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors: - xml: returns an additional annotated text version:: + Chur Chur is the capital and largest town of the Swiss canton of the Grisons and lies in the Grisonian Rhine Valley. + - html: creates an HTML file which contains the converted text and highlights all annotations as outlined below: @@ -282,7 +284,7 @@ Currently, inscriptis supports the following postprocessors: inscript --annotation-rules ./wikipedia.json \ --postprocessor html \ - https://en.wikipedia.org/wiki/Chur.html + https://en.wikipedia.org/wiki/Chur Annotation rules encoded in the ``wikipedia.json`` file: diff --git a/pyproject.toml b/pyproject.toml index 0edc368..f60795f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "inscriptis" -version = "2.5.3" +version = "2.6.0" authors = ["Albert Weichselbraun ", "Fabian Odoni "] description = "inscriptis - HTML to text converter." 
keywords = ["HTML", "converter", "text"] @@ -44,8 +44,11 @@ requests = ">=2.32.2" lxml = ">=4.9.3" # optional dependencies -fastapi = { version = "^0.109.1", optional = true } -uvicorn = { version = "^0.27.1", optional = true } +fastapi = { version = "^0.115.11", optional = true } +uvicorn = { version = "^0.34.0", optional = true } + +[tool.poetry.group.dev.dependencies] +pytest = "^8.3.5" [build-system] diff --git a/src/inscriptis/annotation/output/__init__.py b/src/inscriptis/annotation/output/__init__.py index 41a7fb2..089d1a3 100644 --- a/src/inscriptis/annotation/output/__init__.py +++ b/src/inscriptis/annotation/output/__init__.py @@ -10,9 +10,9 @@ 2. The overwritten :meth:`__call__` method may either extend the original dictionary which contains the extracted text and annotations (e.g., :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or - may replace it with an custom output (e.g., + may replace it with a custom output (e.g., :class:`~inscriptis.annotation.output.html.HtmlExtractor` and - :class:`~inscriptis.annotation.output.xml.XmlExtractor`. + :class:`~inscriptis.annotation.output.xml.XmlExtractor`). Currently, Inscriptis supports the following built-in AnnotationProcessors: @@ -25,6 +25,7 @@ of the extracted annotations. 
""" + from typing import Dict, Any diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py index f7da4a8..8ee5c4a 100644 --- a/src/inscriptis/annotation/output/html.py +++ b/src/inscriptis/annotation/output/html.py @@ -1,4 +1,5 @@ """HTML Annotation Processor.""" + from collections import defaultdict from itertools import cycle from typing import Dict, Any, List @@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor): verbatim = True def __call__(self, annotated_text: Dict[str, Any]) -> str: - tag_indices = defaultdict(list) + tag_dict = defaultdict(list) - for start, end, label in sorted(annotated_text["label"]): - tag_indices[start].append(label) - tag_indices[end].append("/" + label) + for start, end, label in reversed(annotated_text["label"]): + tag_dict[start].append( + f'{label}' + ) + tag_dict[end].insert(0, "") - open_tags = [] tagged_content = [ "
",
         ]
-        for idx, ch in enumerate(annotated_text["text"]):
-            if idx in tag_indices:
-                tags = tag_indices[idx]
-                # close tags:
-                for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
-                    open_tags.pop()
-                    tagged_content.append("</span>")
-                # open tags
-                for tag in (
-                    t for t in sorted(tags, reverse=True) if not t.startswith("/")
-                ):
-                    open_tags.append(tag)
-                    tagged_content.append(
-                        '<span class="{tag}-label">{tag}</span>'
-                        '<span class="{tag}">'.format(tag=tag)
-                    )
-
-            if ch == "\n":
-                tagged_content.extend(["</span>" for _ in open_tags])
-                tagged_content.append("</pre>\n<pre>")
-                tagged_content.extend(
-                    ['<span class="{tag}">'.format(tag=tag) for tag in open_tags]
-                )
-            else:
-                tagged_content.append(ch)
 
+        text = annotated_text["text"]
+        current_idx = 0
+        for idx, tags in sorted(tag_dict.items()):
+            tagged_content.append(text[current_idx:idx].replace("\n", "</pre>\n<pre>"))
+            current_idx = idx
+            tagged_content.extend(tags)
+        tagged_content.append(text[current_idx:].replace("\n", "
\n")) return "".join(tagged_content) + "" @staticmethod diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py index c31aa06..a791f8c 100644 --- a/src/inscriptis/annotation/output/xml.py +++ b/src/inscriptis/annotation/output/xml.py @@ -1,4 +1,5 @@ """XML Annotation processor.""" + from collections import defaultdict from typing import Dict, Any @@ -10,40 +11,20 @@ class XmlExtractor(AnnotationProcessor): verbatim = True - def __call__(self, annotated_text: Dict[str, Any]) -> str: - """Provide an XML version of the given text and annotations. - - Args: - annotated_text: a dictionary containing the plain text and the - extracted annotations. - - Returns: - A string with the XML-version of the content. - """ - tag_indices = defaultdict(list) - - for start, end, label in sorted(annotated_text["label"]): - tag_indices[start].append(label) - tag_indices[end].append("/" + label) + def __call__(self, annotated_text: Dict[str, Any], root_element="content"): + tag_dict = defaultdict(list) + for start, end, tag in reversed(annotated_text["label"]): + tag_dict[start].append(f"<{tag}>") + tag_dict[end].insert(0, f"") current_idx = 0 - tagged_content = ['\n'] text = annotated_text["text"] - for index, tags in sorted(tag_indices.items()): - tagged_content.append(text[current_idx:index]) - # close tags - tagged_content.extend( - [ - "<" + tag + ">" - for tag in sorted(tags, reverse=True) - if tag.startswith("/") - ] - ) - # open tags - tagged_content.extend( - ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")] - ) - current_idx = index - tagged_content.append(text[current_idx:]) + tagged_content = ['\n', "\n"] + for idx, tags in sorted(tag_dict.items()): + tagged_content.append(text[current_idx:idx]) + current_idx = idx + tagged_content.extend(tags) + tagged_content.append(text[current_idx:]) + tagged_content.append("\n") return "".join(tagged_content) diff --git a/src/inscriptis/html_engine.py 
b/src/inscriptis/html_engine.py index 42d849e..1d31560 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -51,7 +51,9 @@ class Inscriptis: text = parser.get_text() """ - def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None): + def __init__( + self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None + ) -> None: # use the default configuration, if no config object is provided config = config or ParserConfig() diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py index 82fdc7a..b5a6e61 100644 --- a/tests/test_annotation_output_processor.py +++ b/tests/test_annotation_output_processor.py @@ -15,7 +15,7 @@ "text": "Chur\n\nChur is the capital and largest town of " "the Swiss canton of the Grisons and lies in the " "Grisonian Rhine Valley.", - "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]], + "label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]], } @@ -36,8 +36,8 @@ def test_surface_annotator(): # and we have additional information on surface forms :) assert result["surface"] == [ - ("heading", "Chur"), ("h1", "Chur"), + ("heading", "Chur"), ("emphasis", "Chur"), ] @@ -48,11 +48,11 @@ def test_xml_annotator(): # and we have additional information on surface forms :) assert result == ( - '\n' - "

Chur

\n\n" + '\n\n' + "

Chur

\n\n" "Chur is the capital and largest town " "of the Swiss canton of the Grisons and lies in " - "the Grisonian Rhine Valley." + "the Grisonian Rhine Valley.\n
" ) @@ -60,9 +60,9 @@ def test_html_annotator(): processor = HtmlExtractor() result = processor(EXAMPLE_OUTPUT) + assert result.startswith("" + assert result.split("")[1] == ("" '
heading'
         ''
         'h1'
@@ -72,8 +72,7 @@ def test_html_annotator():
         'Chur is the capital '
         "and largest town of the Swiss canton of the "
         "Grisons and lies in the Grisonian Rhine Valley."
-        "
" - ) + "") def test_trailing_tag_annotation(): @@ -81,6 +80,6 @@ def test_trailing_tag_annotation(): result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]}) assert result == ( - '\n' - "Ehre sei Gott!" + '\n\n' + "Ehre sei Gott!\n" ) diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py new file mode 100644 index 0000000..cf3338a --- /dev/null +++ b/tests/test_annotation_output_xml.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +""" +Test the annotation XmlExtractor. +""" +from lxml.html import fromstring + +from inscriptis import Inscriptis, ParserConfig +from inscriptis.annotation.output.xml import XmlExtractor + + +def test_tag_error_issue_93(): + """ + Test for the correct tag order in the XmlOutput as described in Issue #93. + """ + html_issue_93 = """ + +
+ Item1 + Item2 + Item3 + Item4 +
+ + """ + + expected_output_issue_93 = ( + """\n\n""" + " Item1 Item2 Item3 " + "Item4\n" + ) + rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]} + + inscriptis = Inscriptis( + fromstring(html_issue_93), ParserConfig(annotation_rules=rules) + ) + annotated_html = { + "text": inscriptis.get_text(), + "label": inscriptis.get_annotations(), + } + result = XmlExtractor()(annotated_html) + assert result == expected_output_issue_93 + + +def test_tag_folding_issue_93_extended(): + html_issue_93 = """ + +
+ Some Test to add :) + Item1 + Item2 + Item3 + Item4 +
+ + """ + + expected_output_issue_93 = ( + """\n""" + """\n""" + """ Some Test to add :) Item 1 Item2 """ + """Item3 It e m4\n""" + """""" + ) + rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]} + + inscriptis = Inscriptis( + fromstring(html_issue_93), ParserConfig(annotation_rules=rules) + ) + annotated_html = { + "text": inscriptis.get_text(), + "label": inscriptis.get_annotations(), + } + result = XmlExtractor()(annotated_html) + assert result == expected_output_issue_93