From d09574a736a4f0b55f8f81770da02fe86ade95eb Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 15:05:19 +0100 Subject: [PATCH 01/11] wip: lxml-based XML generation. --- src/inscriptis/annotation/output/xml.py | 66 +++++++++++++++++++++++- tests/test_annotation_output_xml.py | 68 +++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 2 deletions(-) create mode 100644 tests/test_annotation_output_xml.py diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py index c31aa06..a742854 100644 --- a/src/inscriptis/annotation/output/xml.py +++ b/src/inscriptis/annotation/output/xml.py @@ -1,7 +1,8 @@ """XML Annotation processor.""" from collections import defaultdict -from typing import Dict, Any +from typing import Dict, Any, Tuple +from lxml import etree from inscriptis.annotation.output import AnnotationProcessor @@ -10,7 +11,68 @@ class XmlExtractor(AnnotationProcessor): verbatim = True - def __call__(self, annotated_text: Dict[str, Any]) -> str: + def traverse_element(self, root, text, start, end, annotations, idx) -> int: + while idx + 1 < len(annotations): + idx += 1 + next_start, next_end, label = annotations[idx]["label"] + # recurse? 
+ if next_start < end: + leaf = etree.Element(root, label) + cascaded_end = self.traverse_element(leaf, text, next_start, next_end, idx) + else: + root.tail += text[start: cascaded_end] + + + + def __call__(self, annotated_text: Dict[str, Any], root_element='r') -> str: + text = annotated_text["text"] + annotations = sorted(annotated_text["label"]) + root = etree.Element(root_element) + current_annotation_idx = 0 + while current_annotation_idx < len(annotations): + current_annotation_idx = self.traverse_element(root, text, annotations, idx) + + + for start, end, label in sorted(annotated_text["label"]): + current_element = etree.SubElement(root, label) + current_element.text = text[start:end] + + return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8") + + def call3(self, annotated_text: Dict[str, Any]) -> str: + tag_indices = defaultdict(list) + + for start, end, label in sorted(annotated_text["label"]): + length = end - start + tag_indices[start].append((label, length)) + tag_indices[end].append(("/" + label, length)) + + current_idx = 0 + tagged_content = ['\n'] + text = annotated_text["text"] + for index, tags in sorted(tag_indices.items()): + tagged_content.append(text[current_idx:index]) + + # Separate closing vs opening tags + closing_tags = [t for t in tags if t[0].startswith("/")] + opening_tags = [t for t in tags if not t[0].startswith("/")] + + # Sort closing tags by ascending length (so outer closes last) + closing_tags.sort(key=lambda x: x[1]) + for tag, _ in closing_tags: + tagged_content.append(f"<{tag}>") + + # Sort opening tags by descending length (so outer opens first) + opening_tags.sort(key=lambda x: x[1], reverse=True) + for tag, _ in opening_tags: + tagged_content.append(f"<{tag}>") + + current_idx = index + tagged_content.append(text[current_idx:]) + + return "".join(tagged_content) + + def call2(self, annotated_text: Dict[str, Any]) -> str: """Provide an XML version of the given text and annotations. 
Args: diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py new file mode 100644 index 0000000..8bd0ac9 --- /dev/null +++ b/tests/test_annotation_output_xml.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +""" +Test the annotation XmlExtractor. +""" +from platform import processor +from xml.etree.ElementTree import fromstring + +from inscriptis import Inscriptis, ParserConfig +from inscriptis.annotation.output.xml import XmlExtractor + + + +def test_tag_error_issue_93(): + """ + Test for the correct tag order in the XmlOutput as described in Issue #93. + """ + html_issue_93 = """ + +
+ Item1 + Item2 + Item3 + Item4 +
+ + """ + + expected_output_issue_93 = ("""\n""" + " Item1 Item2 Item3 " + "Item4") + rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]} + + inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules)) + annotated_html = {'text': inscriptis.get_text(), + 'label': inscriptis.get_annotations()} + print(">>>", annotated_html) + + result = XmlExtractor()(annotated_html) + print(result) + assert result == expected_output_issue_93 + +def test_tag_folding_issue_93_extended(): + html_issue_93 = """ + +
+ Some Test to add :) + Item1 + Item2 + Item3 + Item4 +
+ + """ + + expected_output_issue_93 = ("""\n""" + " Some Test to add :) Item1 Item2 Item3 " + "Item4") + rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]} + + inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules)) + annotated_html = {'text': inscriptis.get_text(), + 'label': inscriptis.get_annotations()} + print(">>>", annotated_html) + + result = XmlExtractor()(annotated_html) + print(result) + assert result == expected_output_issue_93 From 3654c4f33e8ef20e301ec4dbff18049afabadccc Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 17:23:12 +0100 Subject: [PATCH 02/11] fix: #93 - correct output XML 1. correct tag order 2. added a root tag with default name to ensure that valid xml is created. --- pyproject.toml | 7 +- src/inscriptis/annotation/__init__.py | 1 + src/inscriptis/annotation/output/xml.py | 103 +++--------------------- src/inscriptis/html_engine.py | 4 +- tests/test_annotation_output_xml.py | 57 +++++++------ 5 files changed, 52 insertions(+), 120 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0edc368..c721628 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,8 +44,11 @@ requests = ">=2.32.2" lxml = ">=4.9.3" # optional dependencies -fastapi = { version = "^0.109.1", optional = true } -uvicorn = { version = "^0.27.1", optional = true } +fastapi = { version = "^0.115.11", optional = true } +uvicorn = { version = "^0.34.0", optional = true } + +[tool.poetry.group.dev.dependencies] +pytest = "^8.3.5" [build-system] diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py index 94e5fe5..fada86e 100644 --- a/src/inscriptis/annotation/__init__.py +++ b/src/inscriptis/annotation/__init__.py @@ -1,5 +1,6 @@ """The model used for saving annotations.""" +from functools import total_ordering from typing import List from typing import NamedTuple diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py 
index a742854..1a925cc 100644 --- a/src/inscriptis/annotation/output/xml.py +++ b/src/inscriptis/annotation/output/xml.py @@ -1,8 +1,8 @@ """XML Annotation processor.""" + from collections import defaultdict -from typing import Dict, Any, Tuple +from typing import Dict, Any -from lxml import etree from inscriptis.annotation.output import AnnotationProcessor @@ -11,101 +11,20 @@ class XmlExtractor(AnnotationProcessor): verbatim = True - def traverse_element(self, root, text, start, end, annotations, idx) -> int: - while idx + 1 < len(annotations): - idx += 1 - next_start, next_end, label = annotations[idx]["label"] - # recurse? - if next_start < end: - leaf = etree.Element(root, label) - cascaded_end = self.traverse_element(leaf, text, next_start, next_end, idx) - else: - root.tail += text[start: cascaded_end] - - - - def __call__(self, annotated_text: Dict[str, Any], root_element='r') -> str: - text = annotated_text["text"] - annotations = sorted(annotated_text["label"]) - root = etree.Element(root_element) - current_annotation_idx = 0 - while current_annotation_idx < len(annotations): - current_annotation_idx = self.traverse_element(root, text, annotations, idx) - - - for start, end, label in sorted(annotated_text["label"]): - current_element = etree.SubElement(root, label) - current_element.text = text[start:end] - - return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8") - - def call3(self, annotated_text: Dict[str, Any]) -> str: - tag_indices = defaultdict(list) - - for start, end, label in sorted(annotated_text["label"]): - length = end - start - tag_indices[start].append((label, length)) - tag_indices[end].append(("/" + label, length)) + def __call__(self, annotated_text: Dict[str, Any], root_element="content"): + tag_dict = defaultdict(list) + for start, end, tag in reversed(annotated_text["label"]): + tag_dict[start].append(f"<{tag}>") + tag_dict[end].insert(0, f"") current_idx = 0 - tagged_content = ['\n'] text = 
annotated_text["text"] - for index, tags in sorted(tag_indices.items()): + tagged_content = ['\n', "\n"] + for index, tags in sorted(tag_dict.items()): tagged_content.append(text[current_idx:index]) - - # Separate closing vs opening tags - closing_tags = [t for t in tags if t[0].startswith("/")] - opening_tags = [t for t in tags if not t[0].startswith("/")] - - # Sort closing tags by ascending length (so outer closes last) - closing_tags.sort(key=lambda x: x[1]) - for tag, _ in closing_tags: - tagged_content.append(f"<{tag}>") - - # Sort opening tags by descending length (so outer opens first) - opening_tags.sort(key=lambda x: x[1], reverse=True) - for tag, _ in opening_tags: - tagged_content.append(f"<{tag}>") - current_idx = index - tagged_content.append(text[current_idx:]) - - return "".join(tagged_content) - - def call2(self, annotated_text: Dict[str, Any]) -> str: - """Provide an XML version of the given text and annotations. - - Args: - annotated_text: a dictionary containing the plain text and the - extracted annotations. - - Returns: - A string with the XML-version of the content. 
- """ - tag_indices = defaultdict(list) + tagged_content.extend(tags) - for start, end, label in sorted(annotated_text["label"]): - tag_indices[start].append(label) - tag_indices[end].append("/" + label) - - current_idx = 0 - tagged_content = ['\n'] - text = annotated_text["text"] - for index, tags in sorted(tag_indices.items()): - tagged_content.append(text[current_idx:index]) - # close tags - tagged_content.extend( - [ - "<" + tag + ">" - for tag in sorted(tags, reverse=True) - if tag.startswith("/") - ] - ) - # open tags - tagged_content.extend( - ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")] - ) - current_idx = index tagged_content.append(text[current_idx:]) - + tagged_content.append("\n") return "".join(tagged_content) diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 42d849e..1d31560 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -51,7 +51,9 @@ class Inscriptis: text = parser.get_text() """ - def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None): + def __init__( + self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None + ) -> None: # use the default configuration, if no config object is provided config = config or ParserConfig() diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py index 8bd0ac9..cf3338a 100644 --- a/tests/test_annotation_output_xml.py +++ b/tests/test_annotation_output_xml.py @@ -3,14 +3,12 @@ """ Test the annotation XmlExtractor. """ -from platform import processor -from xml.etree.ElementTree import fromstring +from lxml.html import fromstring from inscriptis import Inscriptis, ParserConfig from inscriptis.annotation.output.xml import XmlExtractor - def test_tag_error_issue_93(): """ Test for the correct tag order in the XmlOutput as described in Issue #93. 
@@ -26,43 +24,52 @@ def test_tag_error_issue_93(): """ - expected_output_issue_93 = ("""\n""" - " Item1 Item2 Item3 " - "Item4") + expected_output_issue_93 = ( + """\n\n""" + " Item1 Item2 Item3 " + "Item4\n" + ) rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]} - inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules)) - annotated_html = {'text': inscriptis.get_text(), - 'label': inscriptis.get_annotations()} - print(">>>", annotated_html) - + inscriptis = Inscriptis( + fromstring(html_issue_93), ParserConfig(annotation_rules=rules) + ) + annotated_html = { + "text": inscriptis.get_text(), + "label": inscriptis.get_annotations(), + } result = XmlExtractor()(annotated_html) - print(result) assert result == expected_output_issue_93 + def test_tag_folding_issue_93_extended(): html_issue_93 = """
Some Test to add :) - Item1 + Item1 Item2 - Item3 - Item4 + Item3 + Item4
""" - expected_output_issue_93 = ("""\n""" - " Some Test to add :) Item1 Item2 Item3 " - "Item4") - rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]} - - inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules)) - annotated_html = {'text': inscriptis.get_text(), - 'label': inscriptis.get_annotations()} - print(">>>", annotated_html) + expected_output_issue_93 = ( + """\n""" + """\n""" + """ Some Test to add :) Item 1 Item2 """ + """Item3 It e m4\n""" + """""" + ) + rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]} + inscriptis = Inscriptis( + fromstring(html_issue_93), ParserConfig(annotation_rules=rules) + ) + annotated_html = { + "text": inscriptis.get_text(), + "label": inscriptis.get_annotations(), + } result = XmlExtractor()(annotated_html) - print(result) assert result == expected_output_issue_93 From 5df0bb8d92eab4a70c979a7c669d5113c7088ff2 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 17:38:55 +0100 Subject: [PATCH 03/11] fix: removed unnecessary import. --- src/inscriptis/annotation/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py index fada86e..94e5fe5 100644 --- a/src/inscriptis/annotation/__init__.py +++ b/src/inscriptis/annotation/__init__.py @@ -1,6 +1,5 @@ """The model used for saving annotations.""" -from functools import total_ordering from typing import List from typing import NamedTuple From c8f675e32d40f36f8e661ee0a99d29b5d9b4a581 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 17:39:38 +0100 Subject: [PATCH 04/11] fix: adapted XML-unittest to the improved output format. 
--- tests/test_annotation_output_processor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py index 82fdc7a..f03fd78 100644 --- a/tests/test_annotation_output_processor.py +++ b/tests/test_annotation_output_processor.py @@ -48,11 +48,11 @@ def test_xml_annotator(): # and we have additional information on surface forms :) assert result == ( - '\n' + '\n\n' "

Chur

\n\n" "Chur is the capital and largest town " "of the Swiss canton of the Grisons and lies in " - "the Grisonian Rhine Valley." + "the Grisonian Rhine Valley.\n
" ) @@ -81,6 +81,6 @@ def test_trailing_tag_annotation(): result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]}) assert result == ( - '\n' - "Ehre sei Gott!" + '\n\n' + "Ehre sei Gott!\n" ) From ff90b8482f023f6614c0d7d8bdef4cfbf5402b09 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 17:53:32 +0100 Subject: [PATCH 05/11] chg: updated documentation. --- README.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 361191a..4e13e7c 100644 --- a/README.rst +++ b/README.rst @@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be specified with the ``-p`` or ``--postprocessor`` command line argument:: $ inscript https://www.fhgr.ch \ - -r ./annotation/examples/annotation-profile.json \ + -r ./examples/annotation/annotation-profile.json \ -p surface @@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors: - xml: returns an additional annotated text version:: + Chur Chur is the capital and largest town of the Swiss canton of the Grisons and lies in the Grisonian Rhine Valley. + - html: creates an HTML file which contains the converted text and highlights all annotations as outlined below: From 7ab5da549a8ba817d65188f79f1117fc5662adbc Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 17:56:02 +0100 Subject: [PATCH 06/11] fix: Wikipedia URL in example. 
--- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 4e13e7c..2add60b 100644 --- a/README.rst +++ b/README.rst @@ -284,7 +284,7 @@ Currently, inscriptis supports the following postprocessors: inscript --annotation-rules ./wikipedia.json \ --postprocessor html \ - https://en.wikipedia.org/wiki/Chur.html + https://en.wikipedia.org/wiki/Chur Annotation rules encoded in the ``wikipedia.json`` file: From 6733dd5728851c8710d3f006678586a9caef2e9f Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 19:09:23 +0100 Subject: [PATCH 07/11] fix: fixed missing bracket in documentation. --- src/inscriptis/annotation/output/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/inscriptis/annotation/output/__init__.py b/src/inscriptis/annotation/output/__init__.py index 41a7fb2..089d1a3 100644 --- a/src/inscriptis/annotation/output/__init__.py +++ b/src/inscriptis/annotation/output/__init__.py @@ -10,9 +10,9 @@ 2. The overwritten :meth:`__call__` method may either extend the original dictionary which contains the extracted text and annotations (e.g., :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or - may replace it with an custom output (e.g., + may replace it with a custom output (e.g., :class:`~inscriptis.annotation.output.html.HtmlExtractor` and - :class:`~inscriptis.annotation.output.xml.XmlExtractor`. + :class:`~inscriptis.annotation.output.xml.XmlExtractor`). Currently, Inscriptis supports the following built-in AnnotationProcessors: @@ -25,6 +25,7 @@ of the extracted annotations. """ + from typing import Dict, Any From 76ad054febbfd5b6f5896131d58bbadd318f146b Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 19:10:23 +0100 Subject: [PATCH 08/11] chg: backported improved logic from XmlExtractor. 
--- src/inscriptis/annotation/output/html.py | 44 ++++++++---------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py index f7da4a8..8ee5c4a 100644 --- a/src/inscriptis/annotation/output/html.py +++ b/src/inscriptis/annotation/output/html.py @@ -1,4 +1,5 @@ """HTML Annotation Processor.""" + from collections import defaultdict from itertools import cycle from typing import Dict, Any, List @@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor): verbatim = True def __call__(self, annotated_text: Dict[str, Any]) -> str: - tag_indices = defaultdict(list) + tag_dict = defaultdict(list) - for start, end, label in sorted(annotated_text["label"]): - tag_indices[start].append(label) - tag_indices[end].append("/" + label) + for start, end, label in reversed(annotated_text["label"]): + tag_dict[start].append( + f'{label}' + ) + tag_dict[end].insert(0, "") - open_tags = [] tagged_content = [ "
",
         ]
-        for idx, ch in enumerate(annotated_text["text"]):
-            if idx in tag_indices:
-                tags = tag_indices[idx]
-                # close tags:
-                for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
-                    open_tags.pop()
-                    tagged_content.append("")
-                # open tags
-                for tag in (
-                    t for t in sorted(tags, reverse=True) if not t.startswith("/")
-                ):
-                    open_tags.append(tag)
-                    tagged_content.append(
-                        '{tag}'
-                        ''.format(tag=tag)
-                    )
-
-            if ch == "\n":
-                tagged_content.extend(["" for _ in open_tags])
-                tagged_content.append("
\n
")
-                tagged_content.extend(
-                    [''.format(tag=tag) for tag in open_tags]
-                )
-            else:
-                tagged_content.append(ch)
 
+        text = annotated_text["text"]
+        current_idx = 0
+        for idx, tags in sorted(tag_dict.items()):
+            tagged_content.append(text[current_idx:idx].replace("\n", "
\n
"))
+            current_idx = idx
+            tagged_content.extend(tags)
+        tagged_content.append(text[current_idx:].replace("\n", "
\n")) return "".join(tagged_content) + "" @staticmethod From c1e05800239d74d09cc42f99d2f5e970276672c6 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 19:10:48 +0100 Subject: [PATCH 09/11] chg: minor style changes. --- src/inscriptis/annotation/output/xml.py | 6 +++--- tests/test_annotation_output_processor.py | 13 ++++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py index 1a925cc..a791f8c 100644 --- a/src/inscriptis/annotation/output/xml.py +++ b/src/inscriptis/annotation/output/xml.py @@ -20,9 +20,9 @@ def __call__(self, annotated_text: Dict[str, Any], root_element="content"): current_idx = 0 text = annotated_text["text"] tagged_content = ['\n', "\n"] - for index, tags in sorted(tag_dict.items()): - tagged_content.append(text[current_idx:index]) - current_idx = index + for idx, tags in sorted(tag_dict.items()): + tagged_content.append(text[current_idx:idx]) + current_idx = idx tagged_content.extend(tags) tagged_content.append(text[current_idx:]) diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py index f03fd78..b5a6e61 100644 --- a/tests/test_annotation_output_processor.py +++ b/tests/test_annotation_output_processor.py @@ -15,7 +15,7 @@ "text": "Chur\n\nChur is the capital and largest town of " "the Swiss canton of the Grisons and lies in the " "Grisonian Rhine Valley.", - "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]], + "label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]], } @@ -36,8 +36,8 @@ def test_surface_annotator(): # and we have additional information on surface forms :) assert result["surface"] == [ - ("heading", "Chur"), ("h1", "Chur"), + ("heading", "Chur"), ("emphasis", "Chur"), ] @@ -49,7 +49,7 @@ def test_xml_annotator(): # and we have additional information on surface forms :) assert result == ( '\n\n' - "

Chur

\n\n" + "

Chur

\n\n" "Chur is the capital and largest town " "of the Swiss canton of the Grisons and lies in " "the Grisonian Rhine Valley.\n
" @@ -60,9 +60,9 @@ def test_html_annotator(): processor = HtmlExtractor() result = processor(EXAMPLE_OUTPUT) + assert result.startswith("" + assert result.split("")[1] == ("" '
heading'
         ''
         'h1'
@@ -72,8 +72,7 @@ def test_html_annotator():
         'Chur is the capital '
         "and largest town of the Swiss canton of the "
         "Grisons and lies in the Grisonian Rhine Valley."
-        "
" - ) + "") def test_trailing_tag_annotation(): From 8f01fa17d8f05cdd096d4434e54d3ca3d9673a5b Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 19:13:38 +0100 Subject: [PATCH 10/11] chg: upped version. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c721628..f60795f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "inscriptis" -version = "2.5.3" +version = "2.6.0" authors = ["Albert Weichselbraun ", "Fabian Odoni "] description = "inscriptis - HTML to text converter." keywords = ["HTML", "converter", "text"] From 6c89cc7b721db8b9f6e35129b39f68b49359fc68 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 22 Mar 2025 19:20:22 +0100 Subject: [PATCH 11/11] fix: formatting. --- tests/test_annotation_engine.py | 7 +++++-- tests/test_annotation_output_processor.py | 7 ++++--- tests/test_block.py | 1 + tests/test_cli.py | 1 + tests/test_custom_html_tag_handling.py | 1 + 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/test_annotation_engine.py b/tests/test_annotation_engine.py index 67b9050..699d3e2 100644 --- a/tests/test_annotation_engine.py +++ b/tests/test_annotation_engine.py @@ -11,9 +11,12 @@ def test_get_annotation(): """Test get_anntation from the Inscriptis class""" html = "Chur is a City in Switzerland" - rules = {'b': ['bold']} + rules = {"b": ["bold"]} inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules)) assert inscriptis.get_text() == "Chur is a City in Switzerland" - assert inscriptis.get_annotations() == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')] + assert inscriptis.get_annotations() == [ + Annotation(start=0, end=4, metadata="bold"), + Annotation(start=18, end=29, metadata="bold"), + ] diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py index b5a6e61..2613566 100644 
--- a/tests/test_annotation_output_processor.py +++ b/tests/test_annotation_output_processor.py @@ -60,9 +60,9 @@ def test_html_annotator(): processor = HtmlExtractor() result = processor(EXAMPLE_OUTPUT) - assert result.startswith("")[1] == ("" + assert result.split("")[1] == ( + "" '
heading'
         ''
         'h1'
@@ -72,7 +72,8 @@ def test_html_annotator():
         'Chur is the capital '
         "and largest town of the Swiss canton of the "
         "Grisons and lies in the Grisonian Rhine Valley."
-        "
") + "" + ) def test_trailing_tag_annotation(): diff --git a/tests/test_block.py b/tests/test_block.py index 8aacc93..4ce3f7e 100644 --- a/tests/test_block.py +++ b/tests/test_block.py @@ -1,6 +1,7 @@ """ Test cases for the Block class. """ + from inscriptis.model.canvas.block import Block from inscriptis.model.canvas.prefix import Prefix diff --git a/tests/test_cli.py b/tests/test_cli.py index 4e4cfc4..0c86198 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,6 +1,7 @@ """ Tests the Inscriptis CLI client. """ + from io import StringIO from pathlib import Path from json import loads diff --git a/tests/test_custom_html_tag_handling.py b/tests/test_custom_html_tag_handling.py index d050e6a..342f54c 100644 --- a/tests/test_custom_html_tag_handling.py +++ b/tests/test_custom_html_tag_handling.py @@ -1,4 +1,5 @@ """Test the custom HTML tag handling.""" + from lxml.html import fromstring from inscriptis import Inscriptis, ParserConfig