diff --git a/README.rst b/README.rst index 361191a..2add60b 100644 --- a/README.rst +++ b/README.rst @@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be specified with the ``-p`` or ``--postprocessor`` command line argument:: $ inscript https://www.fhgr.ch \ - -r ./annotation/examples/annotation-profile.json \ + -r ./examples/annotation/annotation-profile.json \ -p surface @@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors: - xml: returns an additional annotated text version:: + Chur Chur is the capital and largest town of the Swiss canton of the Grisons and lies in the Grisonian Rhine Valley. + - html: creates an HTML file which contains the converted text and highlights all annotations as outlined below: @@ -282,7 +284,7 @@ Currently, inscriptis supports the following postprocessors: inscript --annotation-rules ./wikipedia.json \ --postprocessor html \ - https://en.wikipedia.org/wiki/Chur.html + https://en.wikipedia.org/wiki/Chur Annotation rules encoded in the ``wikipedia.json`` file: diff --git a/pyproject.toml b/pyproject.toml index 0edc368..f60795f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "inscriptis" -version = "2.5.3" +version = "2.6.0" authors = ["Albert Weichselbraun ", "Fabian Odoni "] description = "inscriptis - HTML to text converter." keywords = ["HTML", "converter", "text"] @@ -44,8 +44,11 @@ requests = ">=2.32.2" lxml = ">=4.9.3" # optional dependencies -fastapi = { version = "^0.109.1", optional = true } -uvicorn = { version = "^0.27.1", optional = true } +fastapi = { version = "^0.115.11", optional = true } +uvicorn = { version = "^0.34.0", optional = true } + +[tool.poetry.group.dev.dependencies] +pytest = "^8.3.5" [build-system] diff --git a/src/inscriptis/annotation/output/__init__.py b/src/inscriptis/annotation/output/__init__.py index 41a7fb2..089d1a3 100644 --- a/src/inscriptis/annotation/output/__init__.py +++ b/src/inscriptis/annotation/output/__init__.py @@ -10,9 +10,9 @@ 2. The overwritten :meth:`__call__` method may either extend the original dictionary which contains the extracted text and annotations (e.g., :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or - may replace it with an custom output (e.g., + may replace it with a custom output (e.g., :class:`~inscriptis.annotation.output.html.HtmlExtractor` and - :class:`~inscriptis.annotation.output.xml.XmlExtractor`. + :class:`~inscriptis.annotation.output.xml.XmlExtractor`). Currently, Inscriptis supports the following built-in AnnotationProcessors: @@ -25,6 +25,7 @@ of the extracted annotations. """ + from typing import Dict, Any diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py index f7da4a8..8ee5c4a 100644 --- a/src/inscriptis/annotation/output/html.py +++ b/src/inscriptis/annotation/output/html.py @@ -1,4 +1,5 @@ """HTML Annotation Processor.""" + from collections import defaultdict from itertools import cycle from typing import Dict, Any, List @@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor): verbatim = True def __call__(self, annotated_text: Dict[str, Any]) -> str: - tag_indices = defaultdict(list) + tag_dict = defaultdict(list) - for start, end, label in sorted(annotated_text["label"]): - tag_indices[start].append(label) - tag_indices[end].append("/" + label) + for start, end, label in reversed(annotated_text["label"]): + tag_dict[start].append( + f'{label}' + ) + tag_dict[end].insert(0, "") - open_tags = [] tagged_content = [ "
",
         ]
-        for idx, ch in enumerate(annotated_text["text"]):
-            if idx in tag_indices:
-                tags = tag_indices[idx]
-                # close tags:
-                for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
-                    open_tags.pop()
-                    tagged_content.append("")
-                # open tags
-                for tag in (
-                    t for t in sorted(tags, reverse=True) if not t.startswith("/")
-                ):
-                    open_tags.append(tag)
-                    tagged_content.append(
-                        '{tag}'
-                        ''.format(tag=tag)
-                    )
-
-            if ch == "\n":
-                tagged_content.extend(["" for _ in open_tags])
-                tagged_content.append("
\n
")
-                tagged_content.extend(
-                    [''.format(tag=tag) for tag in open_tags]
-                )
-            else:
-                tagged_content.append(ch)
 
+        text = annotated_text["text"]
+        current_idx = 0
+        for idx, tags in sorted(tag_dict.items()):
+            tagged_content.append(text[current_idx:idx].replace("\n", "
\n
"))
+            current_idx = idx
+            tagged_content.extend(tags)
+        tagged_content.append(text[current_idx:].replace("\n", "
\n")) return "".join(tagged_content) + "" @staticmethod diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py index c31aa06..a791f8c 100644 --- a/src/inscriptis/annotation/output/xml.py +++ b/src/inscriptis/annotation/output/xml.py @@ -1,4 +1,5 @@ """XML Annotation processor.""" + from collections import defaultdict from typing import Dict, Any @@ -10,40 +11,20 @@ class XmlExtractor(AnnotationProcessor): verbatim = True - def __call__(self, annotated_text: Dict[str, Any]) -> str: - """Provide an XML version of the given text and annotations. - - Args: - annotated_text: a dictionary containing the plain text and the - extracted annotations. - - Returns: - A string with the XML-version of the content. - """ - tag_indices = defaultdict(list) - - for start, end, label in sorted(annotated_text["label"]): - tag_indices[start].append(label) - tag_indices[end].append("/" + label) + def __call__(self, annotated_text: Dict[str, Any], root_element="content"): + tag_dict = defaultdict(list) + for start, end, tag in reversed(annotated_text["label"]): + tag_dict[start].append(f"<{tag}>") + tag_dict[end].insert(0, f"") current_idx = 0 - tagged_content = ['\n'] text = annotated_text["text"] - for index, tags in sorted(tag_indices.items()): - tagged_content.append(text[current_idx:index]) - # close tags - tagged_content.extend( - [ - "<" + tag + ">" - for tag in sorted(tags, reverse=True) - if tag.startswith("/") - ] - ) - # open tags - tagged_content.extend( - ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")] - ) - current_idx = index - tagged_content.append(text[current_idx:]) + tagged_content = ['\n', "\n"] + for idx, tags in sorted(tag_dict.items()): + tagged_content.append(text[current_idx:idx]) + current_idx = idx + tagged_content.extend(tags) + tagged_content.append(text[current_idx:]) + tagged_content.append("\n") return "".join(tagged_content) diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 42d849e..1d31560 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -51,7 +51,9 @@ class Inscriptis: text = parser.get_text() """ - def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None): + def __init__( + self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None + ) -> None: # use the default configuration, if no config object is provided config = config or ParserConfig() diff --git a/tests/test_annotation_engine.py b/tests/test_annotation_engine.py index 67b9050..699d3e2 100644 --- a/tests/test_annotation_engine.py +++ b/tests/test_annotation_engine.py @@ -11,9 +11,12 @@ def test_get_annotation(): """Test get_anntation from the Inscriptis class""" html = "Chur is a City in Switzerland" - rules = {'b': ['bold']} + rules = {"b": ["bold"]} inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules)) assert inscriptis.get_text() == "Chur is a City in Switzerland" - assert inscriptis.get_annotations() == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')] + assert inscriptis.get_annotations() == [ + Annotation(start=0, end=4, metadata="bold"), + Annotation(start=18, end=29, metadata="bold"), + ] diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py index 82fdc7a..2613566 100644 --- a/tests/test_annotation_output_processor.py +++ b/tests/test_annotation_output_processor.py @@ -15,7 +15,7 @@ "text": "Chur\n\nChur is the capital and largest town of " "the Swiss canton of the Grisons and lies in the " "Grisonian Rhine Valley.", - "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]], + "label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]], } @@ -36,8 +36,8 @@ def test_surface_annotator(): # and we have additional information on surface forms :) assert result["surface"] == [ - ("heading", "Chur"), ("h1", "Chur"), + ("heading", "Chur"), ("emphasis", "Chur"), ] @@ -48,11 +48,11 @@ def test_xml_annotator(): # and we have additional information on surface forms :) assert result == ( - '\n' - "

Chur

\n\n" + '\n\n' + "

Chur

\n\n" "Chur is the capital and largest town " "of the Swiss canton of the Grisons and lies in " - "the Grisonian Rhine Valley." + "the Grisonian Rhine Valley.\n
" ) @@ -61,8 +61,8 @@ def test_html_annotator(): result = processor(EXAMPLE_OUTPUT) assert result.startswith("" + assert result.split("")[1] == ( + "" '
heading'
         ''
         'h1'
@@ -81,6 +81,6 @@ def test_trailing_tag_annotation():
     result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]})
 
     assert result == (
-        '\n'
-        "Ehre sei Gott!"
+        '\n\n'
+        "Ehre sei Gott!\n"
     )
diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py
new file mode 100644
index 0000000..cf3338a
--- /dev/null
+++ b/tests/test_annotation_output_xml.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+
+"""
+Test the annotation XmlExtractor.
+"""
+from lxml.html import fromstring
+
+from inscriptis import Inscriptis, ParserConfig
+from inscriptis.annotation.output.xml import XmlExtractor
+
+
+def test_tag_error_issue_93():
+    """
+    Test for the correct tag order in the XmlOutput as described in Issue #93.
+    """
+    html_issue_93 = """
+       
+         
+ Item1 + Item2 + Item3 + Item4 +
+ + """ + + expected_output_issue_93 = ( + """\n\n""" + " Item1 Item2 Item3 " + "Item4\n" + ) + rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]} + + inscriptis = Inscriptis( + fromstring(html_issue_93), ParserConfig(annotation_rules=rules) + ) + annotated_html = { + "text": inscriptis.get_text(), + "label": inscriptis.get_annotations(), + } + result = XmlExtractor()(annotated_html) + assert result == expected_output_issue_93 + + +def test_tag_folding_issue_93_extended(): + html_issue_93 = """ + +
+ Some Test to add :) + Item1 + Item2 + Item3 + Item4 +
+ + """ + + expected_output_issue_93 = ( + """\n""" + """\n""" + """ Some Test to add :) Item 1 Item2 """ + """Item3 It e m4\n""" + """""" + ) + rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]} + + inscriptis = Inscriptis( + fromstring(html_issue_93), ParserConfig(annotation_rules=rules) + ) + annotated_html = { + "text": inscriptis.get_text(), + "label": inscriptis.get_annotations(), + } + result = XmlExtractor()(annotated_html) + assert result == expected_output_issue_93 diff --git a/tests/test_block.py b/tests/test_block.py index 8aacc93..4ce3f7e 100644 --- a/tests/test_block.py +++ b/tests/test_block.py @@ -1,6 +1,7 @@ """ Test cases for the Block class. """ + from inscriptis.model.canvas.block import Block from inscriptis.model.canvas.prefix import Prefix diff --git a/tests/test_cli.py b/tests/test_cli.py index 4e4cfc4..0c86198 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,6 +1,7 @@ """ Tests the Inscriptis CLI client. """ + from io import StringIO from pathlib import Path from json import loads diff --git a/tests/test_custom_html_tag_handling.py b/tests/test_custom_html_tag_handling.py index d050e6a..342f54c 100644 --- a/tests/test_custom_html_tag_handling.py +++ b/tests/test_custom_html_tag_handling.py @@ -1,4 +1,5 @@ """Test the custom HTML tag handling.""" + from lxml.html import fromstring from inscriptis import Inscriptis, ParserConfig