"]
description = "inscriptis - HTML to text converter."
keywords = ["HTML", "converter", "text"]
@@ -44,8 +44,11 @@ requests = ">=2.32.2"
lxml = ">=4.9.3"
# optional dependencies
-fastapi = { version = "^0.109.1", optional = true }
-uvicorn = { version = "^0.27.1", optional = true }
+fastapi = { version = "^0.115.11", optional = true }
+uvicorn = { version = "^0.34.0", optional = true }
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.3.5"
[build-system]
diff --git a/src/inscriptis/annotation/output/__init__.py b/src/inscriptis/annotation/output/__init__.py
index 41a7fb2..089d1a3 100644
--- a/src/inscriptis/annotation/output/__init__.py
+++ b/src/inscriptis/annotation/output/__init__.py
@@ -10,9 +10,9 @@
2. The overwritten :meth:`__call__` method may either extend the original
dictionary which contains the extracted text and annotations (e.g.,
:class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
- may replace it with an custom output (e.g.,
+ may replace it with a custom output (e.g.,
:class:`~inscriptis.annotation.output.html.HtmlExtractor` and
- :class:`~inscriptis.annotation.output.xml.XmlExtractor`.
+ :class:`~inscriptis.annotation.output.xml.XmlExtractor`).
Currently, Inscriptis supports the following built-in AnnotationProcessors:
@@ -25,6 +25,7 @@
of the extracted annotations.
"""
+
from typing import Dict, Any
diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py
index f7da4a8..8ee5c4a 100644
--- a/src/inscriptis/annotation/output/html.py
+++ b/src/inscriptis/annotation/output/html.py
@@ -1,4 +1,5 @@
"""HTML Annotation Processor."""
+
from collections import defaultdict
from itertools import cycle
from typing import Dict, Any, List
@@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor):
verbatim = True
def __call__(self, annotated_text: Dict[str, Any]) -> str:
- tag_indices = defaultdict(list)
+ tag_dict = defaultdict(list)
- for start, end, label in sorted(annotated_text["label"]):
- tag_indices[start].append(label)
- tag_indices[end].append("/" + label)
+ for start, end, label in reversed(annotated_text["label"]):
+ tag_dict[start].append(
+ f'{label}'
+ )
+ tag_dict[end].insert(0, "")
- open_tags = []
tagged_content = [
"",
]
- for idx, ch in enumerate(annotated_text["text"]):
- if idx in tag_indices:
- tags = tag_indices[idx]
- # close tags:
- for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
- open_tags.pop()
- tagged_content.append("")
- # open tags
- for tag in (
- t for t in sorted(tags, reverse=True) if not t.startswith("/")
- ):
- open_tags.append(tag)
- tagged_content.append(
- '{tag}'
- ''.format(tag=tag)
- )
-
- if ch == "\n":
- tagged_content.extend(["" for _ in open_tags])
- tagged_content.append("\n")
- tagged_content.extend(
- [''.format(tag=tag) for tag in open_tags]
- )
- else:
- tagged_content.append(ch)
+ text = annotated_text["text"]
+ current_idx = 0
+ for idx, tags in sorted(tag_dict.items()):
+ tagged_content.append(text[current_idx:idx].replace("\n", "\n"))
+ current_idx = idx
+ tagged_content.extend(tags)
+ tagged_content.append(text[current_idx:].replace("\n", "\n"))
return "".join(tagged_content) + ""
@staticmethod
diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py
index c31aa06..a791f8c 100644
--- a/src/inscriptis/annotation/output/xml.py
+++ b/src/inscriptis/annotation/output/xml.py
@@ -1,4 +1,5 @@
"""XML Annotation processor."""
+
from collections import defaultdict
from typing import Dict, Any
@@ -10,40 +11,20 @@ class XmlExtractor(AnnotationProcessor):
verbatim = True
- def __call__(self, annotated_text: Dict[str, Any]) -> str:
- """Provide an XML version of the given text and annotations.
-
- Args:
- annotated_text: a dictionary containing the plain text and the
- extracted annotations.
-
- Returns:
- A string with the XML-version of the content.
- """
- tag_indices = defaultdict(list)
-
- for start, end, label in sorted(annotated_text["label"]):
- tag_indices[start].append(label)
- tag_indices[end].append("/" + label)
+ def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
+ tag_dict = defaultdict(list)
+ for start, end, tag in reversed(annotated_text["label"]):
+ tag_dict[start].append(f"<{tag}>")
+ tag_dict[end].insert(0, f"</{tag}>")
current_idx = 0
- tagged_content = ['\n']
text = annotated_text["text"]
- for index, tags in sorted(tag_indices.items()):
- tagged_content.append(text[current_idx:index])
- # close tags
- tagged_content.extend(
- [
- "<" + tag + ">"
- for tag in sorted(tags, reverse=True)
- if tag.startswith("/")
- ]
- )
- # open tags
- tagged_content.extend(
- ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
- )
- current_idx = index
- tagged_content.append(text[current_idx:])
+ tagged_content = ['\n', "\n"]
+ for idx, tags in sorted(tag_dict.items()):
+ tagged_content.append(text[current_idx:idx])
+ current_idx = idx
+ tagged_content.extend(tags)
+ tagged_content.append(text[current_idx:])
+ tagged_content.append("\n")
return "".join(tagged_content)
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
index 42d849e..1d31560 100644
--- a/src/inscriptis/html_engine.py
+++ b/src/inscriptis/html_engine.py
@@ -51,7 +51,9 @@ class Inscriptis:
text = parser.get_text()
"""
- def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
+ def __init__(
+ self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
+ ) -> None:
# use the default configuration, if no config object is provided
config = config or ParserConfig()
diff --git a/tests/test_annotation_engine.py b/tests/test_annotation_engine.py
index 67b9050..699d3e2 100644
--- a/tests/test_annotation_engine.py
+++ b/tests/test_annotation_engine.py
@@ -11,9 +11,12 @@
def test_get_annotation():
"""Test get_annotations from the Inscriptis class"""
html = "Chur is a City in Switzerland"
- rules = {'b': ['bold']}
+ rules = {"b": ["bold"]}
inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules))
assert inscriptis.get_text() == "Chur is a City in Switzerland"
- assert inscriptis.get_annotations() == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')]
+ assert inscriptis.get_annotations() == [
+ Annotation(start=0, end=4, metadata="bold"),
+ Annotation(start=18, end=29, metadata="bold"),
+ ]
diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py
index 82fdc7a..2613566 100644
--- a/tests/test_annotation_output_processor.py
+++ b/tests/test_annotation_output_processor.py
@@ -15,7 +15,7 @@
"text": "Chur\n\nChur is the capital and largest town of "
"the Swiss canton of the Grisons and lies in the "
"Grisonian Rhine Valley.",
- "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]],
+ "label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]],
}
@@ -36,8 +36,8 @@ def test_surface_annotator():
# and we have additional information on surface forms :)
assert result["surface"] == [
- ("heading", "Chur"),
("h1", "Chur"),
+ ("heading", "Chur"),
("emphasis", "Chur"),
]
@@ -48,11 +48,11 @@ def test_xml_annotator():
# and we have additional information on surface forms :)
assert result == (
- '\n'
- "Chur
\n\n"
+ '\n\n'
+ "Chur
\n\n"
"Chur is the capital and largest town "
"of the Swiss canton of the Grisons and lies in "
- "the Grisonian Rhine Valley."
+ "the Grisonian Rhine Valley.\n"
)
@@ -61,8 +61,8 @@ def test_html_annotator():
result = processor(EXAMPLE_OUTPUT)
assert result.startswith(""
+ assert result.split("")[1] == (
+ ""
'heading'
''
'h1'
@@ -81,6 +81,6 @@ def test_trailing_tag_annotation():
result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]})
assert result == (
- '\n'
- "Ehre sei Gott!"
+ '\n\n'
+ "Ehre sei Gott!\n"
)
diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py
new file mode 100644
index 0000000..cf3338a
--- /dev/null
+++ b/tests/test_annotation_output_xml.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+
+"""
+Test the annotation XmlExtractor.
+"""
+from lxml.html import fromstring
+
+from inscriptis import Inscriptis, ParserConfig
+from inscriptis.annotation.output.xml import XmlExtractor
+
+
+def test_tag_error_issue_93():
+ """
+ Test for the correct tag order in the XmlOutput as described in Issue #93.
+ """
+ html_issue_93 = """
+
+
+ Item1
+ Item2
+ Item3
+ Item4
+
+
+ """
+
+ expected_output_issue_93 = (
+ """\n\n"""
+ " Item1 Item2 Item3 "
+ "Item4\n"
+ )
+ rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
+
+ inscriptis = Inscriptis(
+ fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
+ )
+ annotated_html = {
+ "text": inscriptis.get_text(),
+ "label": inscriptis.get_annotations(),
+ }
+ result = XmlExtractor()(annotated_html)
+ assert result == expected_output_issue_93
+
+
+def test_tag_folding_issue_93_extended():
+ html_issue_93 = """
+
+
+ Some Test to add :)
+ Item1
+ Item2
+ Item3
+ Item4
+
+
+ """
+
+ expected_output_issue_93 = (
+ """\n"""
+ """\n"""
+ """ Some Test to add :) Item 1 Item2 """
+ """Item3 It e m4\n"""
+ """"""
+ )
+ rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]}
+
+ inscriptis = Inscriptis(
+ fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
+ )
+ annotated_html = {
+ "text": inscriptis.get_text(),
+ "label": inscriptis.get_annotations(),
+ }
+ result = XmlExtractor()(annotated_html)
+ assert result == expected_output_issue_93
diff --git a/tests/test_block.py b/tests/test_block.py
index 8aacc93..4ce3f7e 100644
--- a/tests/test_block.py
+++ b/tests/test_block.py
@@ -1,6 +1,7 @@
"""
Test cases for the Block class.
"""
+
from inscriptis.model.canvas.block import Block
from inscriptis.model.canvas.prefix import Prefix
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 4e4cfc4..0c86198 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,6 +1,7 @@
"""
Tests the Inscriptis CLI client.
"""
+
from io import StringIO
from pathlib import Path
from json import loads
diff --git a/tests/test_custom_html_tag_handling.py b/tests/test_custom_html_tag_handling.py
index d050e6a..342f54c 100644
--- a/tests/test_custom_html_tag_handling.py
+++ b/tests/test_custom_html_tag_handling.py
@@ -1,4 +1,5 @@
"""Test the custom HTML tag handling."""
+
from lxml.html import fromstring
from inscriptis import Inscriptis, ParserConfig