From d09574a736a4f0b55f8f81770da02fe86ade95eb Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 15:05:19 +0100
Subject: [PATCH 01/11] wip: lxml-based XML generation.

---
 src/inscriptis/annotation/output/xml.py | 66 +++++++++++++++++++++++-
 tests/test_annotation_output_xml.py     | 68 +++++++++++++++++++++++++
 2 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_annotation_output_xml.py

diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py
index c31aa06..a742854 100644
--- a/src/inscriptis/annotation/output/xml.py
+++ b/src/inscriptis/annotation/output/xml.py
@@ -1,7 +1,8 @@
 """XML Annotation processor."""
 from collections import defaultdict
-from typing import Dict, Any
+from typing import Dict, Any, Tuple
 
+from lxml import etree
 from inscriptis.annotation.output import AnnotationProcessor
 
 
@@ -10,7 +11,68 @@ class XmlExtractor(AnnotationProcessor):
 
     verbatim = True
 
-    def __call__(self, annotated_text: Dict[str, Any]) -> str:
+    def traverse_element(self, root, text, start, end, annotations, idx) -> int:
+        while idx + 1 < len(annotations):
+            idx += 1
+            next_start, next_end, label = annotations[idx]["label"]
+            # recurse?
+            if next_start < end:
+                leaf = etree.Element(root, label)
+                cascaded_end = self.traverse_element(leaf, text, next_start, next_end, idx)
+            else:
+                root.tail += text[start: cascaded_end]
+
+
+
+    def __call__(self, annotated_text: Dict[str, Any], root_element='r') -> str:
+        text = annotated_text["text"]
+        annotations = sorted(annotated_text["label"])
+        root = etree.Element(root_element)
+        current_annotation_idx = 0
+        while current_annotation_idx < len(annotations):
+            current_annotation_idx = self.traverse_element(root, text, annotations, idx)
+
+
+        for start, end, label in sorted(annotated_text["label"]):
+            current_element = etree.SubElement(root, label)
+            current_element.text = text[start:end]
+
+        return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8")
+
+    def call3(self, annotated_text: Dict[str, Any]) -> str:
+        tag_indices = defaultdict(list)
+
+        for start, end, label in sorted(annotated_text["label"]):
+            length = end - start
+            tag_indices[start].append((label, length))
+            tag_indices[end].append(("/" + label, length))
+
+        current_idx = 0
+        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
+        text = annotated_text["text"]
+        for index, tags in sorted(tag_indices.items()):
+            tagged_content.append(text[current_idx:index])
+
+            # Separate closing vs opening tags
+            closing_tags = [t for t in tags if t[0].startswith("/")]
+            opening_tags = [t for t in tags if not t[0].startswith("/")]
+
+            # Sort closing tags by ascending length (so outer closes last)
+            closing_tags.sort(key=lambda x: x[1])
+            for tag, _ in closing_tags:
+                tagged_content.append(f"<{tag}>")
+
+            # Sort opening tags by descending length (so outer opens first)
+            opening_tags.sort(key=lambda x: x[1], reverse=True)
+            for tag, _ in opening_tags:
+                tagged_content.append(f"<{tag}>")
+
+            current_idx = index
+        tagged_content.append(text[current_idx:])
+
+        return "".join(tagged_content)
+
+    def call2(self, annotated_text: Dict[str, Any]) -> str:
         """Provide an XML version of the given text and annotations.
 
         Args:
diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py
new file mode 100644
index 0000000..8bd0ac9
--- /dev/null
+++ b/tests/test_annotation_output_xml.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+"""
+Test the annotation XmlExtractor.
+"""
+from platform import processor
+from xml.etree.ElementTree import fromstring
+
+from inscriptis import Inscriptis, ParserConfig
+from inscriptis.annotation.output.xml import XmlExtractor
+
+
+
+def test_tag_error_issue_93():
+    """
+    Test for the correct tag order in the XmlOutput as described in Issue #93.
+    """
+    html_issue_93 = """<html>
+       <body>
+         <div class="a">
+            <span class="b">Item1</span>
+            <span class="b">Item2</span>
+            <span class="b">Item3</span>
+            <span class="b">Item4</span>
+         </div>
+       </body>
+    </html>"""
+
+    expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
+                                "<outer><inner>  Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
+                                "<inner>Item4</inner></outer>")
+    rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
+
+    inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
+    annotated_html = {'text': inscriptis.get_text(),
+                      'label': inscriptis.get_annotations()}
+    print(">>>", annotated_html)
+
+    result = XmlExtractor()(annotated_html)
+    print(result)
+    assert result == expected_output_issue_93
+
+def test_tag_folding_issue_93_extended():
+    html_issue_93 = """<html>
+       <body>
+         <div class="a">
+         Some Test to add :)
+            <span class="b">Item1</span>
+            <span class="b">Item2</span>
+            <span class="b">Item3</span>
+            <span class="b">Item4</span>
+         </div>
+       </body>
+    </html>"""
+
+    expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
+                                "<outer>  Some Test to add :) <inner>Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
+                                "<inner>Item4</inner></outer>")
+    rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
+
+    inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
+    annotated_html = {'text': inscriptis.get_text(),
+                      'label': inscriptis.get_annotations()}
+    print(">>>", annotated_html)
+
+    result = XmlExtractor()(annotated_html)
+    print(result)
+    assert result == expected_output_issue_93

From 3654c4f33e8ef20e301ec4dbff18049afabadccc Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 17:23:12 +0100
Subject: [PATCH 02/11] fix: #93 - correct output XML

1. correct tag order
2. added a root tag with default name <content> to ensure that valid xml
   is created.
---
 pyproject.toml                          |   7 +-
 src/inscriptis/annotation/__init__.py   |   1 +
 src/inscriptis/annotation/output/xml.py | 103 +++---------------------
 src/inscriptis/html_engine.py           |   4 +-
 tests/test_annotation_output_xml.py     |  57 +++++++------
 5 files changed, 52 insertions(+), 120 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0edc368..c721628 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,8 +44,11 @@ requests = ">=2.32.2"
 lxml = ">=4.9.3"
 
 # optional dependencies
-fastapi = { version = "^0.109.1", optional = true }
-uvicorn = { version = "^0.27.1", optional = true }
+fastapi = { version = "^0.115.11", optional = true }
+uvicorn = { version = "^0.34.0", optional = true }
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.3.5"
 
 
 [build-system]
diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py
index 94e5fe5..fada86e 100644
--- a/src/inscriptis/annotation/__init__.py
+++ b/src/inscriptis/annotation/__init__.py
@@ -1,5 +1,6 @@
 """The model used for saving annotations."""
 
+from functools import total_ordering
 from typing import List
 from typing import NamedTuple
 
diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py
index a742854..1a925cc 100644
--- a/src/inscriptis/annotation/output/xml.py
+++ b/src/inscriptis/annotation/output/xml.py
@@ -1,8 +1,8 @@
 """XML Annotation processor."""
+
 from collections import defaultdict
-from typing import Dict, Any, Tuple
+from typing import Dict, Any
 
-from lxml import etree
 from inscriptis.annotation.output import AnnotationProcessor
 
 
@@ -11,101 +11,20 @@ class XmlExtractor(AnnotationProcessor):
 
     verbatim = True
 
-    def traverse_element(self, root, text, start, end, annotations, idx) -> int:
-        while idx + 1 < len(annotations):
-            idx += 1
-            next_start, next_end, label = annotations[idx]["label"]
-            # recurse?
-            if next_start < end:
-                leaf = etree.Element(root, label)
-                cascaded_end = self.traverse_element(leaf, text, next_start, next_end, idx)
-            else:
-                root.tail += text[start: cascaded_end]
-
-
-
-    def __call__(self, annotated_text: Dict[str, Any], root_element='r') -> str:
-        text = annotated_text["text"]
-        annotations = sorted(annotated_text["label"])
-        root = etree.Element(root_element)
-        current_annotation_idx = 0
-        while current_annotation_idx < len(annotations):
-            current_annotation_idx = self.traverse_element(root, text, annotations, idx)
-
-
-        for start, end, label in sorted(annotated_text["label"]):
-            current_element = etree.SubElement(root, label)
-            current_element.text = text[start:end]
-
-        return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8")
-
-    def call3(self, annotated_text: Dict[str, Any]) -> str:
-        tag_indices = defaultdict(list)
-
-        for start, end, label in sorted(annotated_text["label"]):
-            length = end - start
-            tag_indices[start].append((label, length))
-            tag_indices[end].append(("/" + label, length))
+    def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
+        tag_dict = defaultdict(list)
+        for start, end, tag in reversed(annotated_text["label"]):
+            tag_dict[start].append(f"<{tag}>")
+            tag_dict[end].insert(0, f"</{tag}>")
 
         current_idx = 0
-        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
         text = annotated_text["text"]
-        for index, tags in sorted(tag_indices.items()):
+        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n', "<content>\n"]
+        for index, tags in sorted(tag_dict.items()):
             tagged_content.append(text[current_idx:index])
-
-            # Separate closing vs opening tags
-            closing_tags = [t for t in tags if t[0].startswith("/")]
-            opening_tags = [t for t in tags if not t[0].startswith("/")]
-
-            # Sort closing tags by ascending length (so outer closes last)
-            closing_tags.sort(key=lambda x: x[1])
-            for tag, _ in closing_tags:
-                tagged_content.append(f"<{tag}>")
-
-            # Sort opening tags by descending length (so outer opens first)
-            opening_tags.sort(key=lambda x: x[1], reverse=True)
-            for tag, _ in opening_tags:
-                tagged_content.append(f"<{tag}>")
-
             current_idx = index
-        tagged_content.append(text[current_idx:])
-
-        return "".join(tagged_content)
-
-    def call2(self, annotated_text: Dict[str, Any]) -> str:
-        """Provide an XML version of the given text and annotations.
-
-        Args:
-            annotated_text: a dictionary containing the plain text and the
-                            extracted annotations.
-
-        Returns:
-            A string with the XML-version of the content.
-        """
-        tag_indices = defaultdict(list)
+            tagged_content.extend(tags)
 
-        for start, end, label in sorted(annotated_text["label"]):
-            tag_indices[start].append(label)
-            tag_indices[end].append("/" + label)
-
-        current_idx = 0
-        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
-        text = annotated_text["text"]
-        for index, tags in sorted(tag_indices.items()):
-            tagged_content.append(text[current_idx:index])
-            # close tags
-            tagged_content.extend(
-                [
-                    "<" + tag + ">"
-                    for tag in sorted(tags, reverse=True)
-                    if tag.startswith("/")
-                ]
-            )
-            # open tags
-            tagged_content.extend(
-                ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
-            )
-            current_idx = index
         tagged_content.append(text[current_idx:])
-
+        tagged_content.append("\n</content>")
         return "".join(tagged_content)
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
index 42d849e..1d31560 100644
--- a/src/inscriptis/html_engine.py
+++ b/src/inscriptis/html_engine.py
@@ -51,7 +51,9 @@ class Inscriptis:
       text = parser.get_text()
     """
 
-    def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
+    def __init__(
+        self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
+    ) -> None:
         # use the default configuration, if no config object is provided
         config = config or ParserConfig()
 
diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py
index 8bd0ac9..cf3338a 100644
--- a/tests/test_annotation_output_xml.py
+++ b/tests/test_annotation_output_xml.py
@@ -3,14 +3,12 @@
 """
 Test the annotation XmlExtractor.
 """
-from platform import processor
-from xml.etree.ElementTree import fromstring
+from lxml.html import fromstring
 
 from inscriptis import Inscriptis, ParserConfig
 from inscriptis.annotation.output.xml import XmlExtractor
 
 
-
 def test_tag_error_issue_93():
     """
     Test for the correct tag order in the XmlOutput as described in Issue #93.
@@ -26,43 +24,52 @@ def test_tag_error_issue_93():
        </body>
     </html>"""
 
-    expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
-                                "<outer><inner>  Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
-                                "<inner>Item4</inner></outer>")
+    expected_output_issue_93 = (
+        """<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n"""
+        "<outer><inner>  Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
+        "<inner>Item4</inner></outer>\n</content>"
+    )
     rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
 
-    inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
-    annotated_html = {'text': inscriptis.get_text(),
-                      'label': inscriptis.get_annotations()}
-    print(">>>", annotated_html)
-
+    inscriptis = Inscriptis(
+        fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
+    )
+    annotated_html = {
+        "text": inscriptis.get_text(),
+        "label": inscriptis.get_annotations(),
+    }
     result = XmlExtractor()(annotated_html)
-    print(result)
     assert result == expected_output_issue_93
 
+
 def test_tag_folding_issue_93_extended():
     html_issue_93 = """<html>
        <body>
          <div class="a">
          Some Test to add :)
-            <span class="b">Item1</span>
+            <span class="b">Item<b>1</b></span>
             <span class="b">Item2</span>
-            <span class="b">Item3</span>
-            <span class="b">Item4</span>
+            <span class="b"><b>Item3</b></span>
+            <span class="b"><b>It</b>e<b>m4</b></span>
          </div>
        </body>
     </html>"""
 
-    expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
-                                "<outer>  Some Test to add :) <inner>Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
-                                "<inner>Item4</inner></outer>")
-    rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
-
-    inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
-    annotated_html = {'text': inscriptis.get_text(),
-                      'label': inscriptis.get_annotations()}
-    print(">>>", annotated_html)
+    expected_output_issue_93 = (
+        """<?xml version="1.0" encoding="UTF-8" ?>\n"""
+        """<content>\n"""
+        """<outer>  Some Test to add :) <inner>Item <bold>1</bold></inner> <inner>Item2 </inner>"""
+        """<inner><bold>Item3</bold></inner> <inner><bold>It</bold> e <bold>m4</bold></inner></outer>\n"""
+        """</content>"""
+    )
+    rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]}
 
+    inscriptis = Inscriptis(
+        fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
+    )
+    annotated_html = {
+        "text": inscriptis.get_text(),
+        "label": inscriptis.get_annotations(),
+    }
     result = XmlExtractor()(annotated_html)
-    print(result)
     assert result == expected_output_issue_93

From 5df0bb8d92eab4a70c979a7c669d5113c7088ff2 Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 17:38:55 +0100
Subject: [PATCH 03/11] fix: removed unnecessary import.

---
 src/inscriptis/annotation/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py
index fada86e..94e5fe5 100644
--- a/src/inscriptis/annotation/__init__.py
+++ b/src/inscriptis/annotation/__init__.py
@@ -1,6 +1,5 @@
 """The model used for saving annotations."""
 
-from functools import total_ordering
 from typing import List
 from typing import NamedTuple
 

From c8f675e32d40f36f8e661ee0a99d29b5d9b4a581 Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 17:39:38 +0100
Subject: [PATCH 04/11] fix: adapted XML-unittest to the improved output
 format.

---
 tests/test_annotation_output_processor.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py
index 82fdc7a..f03fd78 100644
--- a/tests/test_annotation_output_processor.py
+++ b/tests/test_annotation_output_processor.py
@@ -48,11 +48,11 @@ def test_xml_annotator():
 
     # and we have additional information on surface forms :)
     assert result == (
-        '<?xml version="1.0" encoding="UTF-8" ?>\n'
+        '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
         "<h1><heading>Chur</heading></h1>\n\n<emphasis>"
         "Chur</emphasis> is the capital and largest town "
         "of the Swiss canton of the Grisons and lies in "
-        "the Grisonian Rhine Valley."
+        "the Grisonian Rhine Valley.\n</content>"
     )
 
 
@@ -81,6 +81,6 @@ def test_trailing_tag_annotation():
     result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]})
 
     assert result == (
-        '<?xml version="1.0" encoding="UTF-8" ?>\n'
-        "Ehre sei <emphasis>Gott!</emphasis>"
+        '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
+        "Ehre sei <emphasis>Gott!</emphasis>\n</content>"
     )

From ff90b8482f023f6614c0d7d8bdef4cfbf5402b09 Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 17:53:32 +0100
Subject: [PATCH 05/11] chg: updated documentation.

---
 README.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 361191a..4e13e7c 100644
--- a/README.rst
+++ b/README.rst
@@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
 specified with the ``-p`` or ``--postprocessor`` command line argument::
 
   $ inscript https://www.fhgr.ch \
-          -r ./annotation/examples/annotation-profile.json \
+          -r ./examples/annotation/annotation-profile.json \
           -p surface
 
 
@@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors:
 - xml: returns an additional annotated text version::
 
     <?xml version="1.0" encoding="UTF-8" ?>
+    <content>
     <heading>Chur</heading>
 
     <emphasis>Chur</emphasis> is the capital and largest town of the Swiss
     canton of the Grisons and lies in the Grisonian Rhine Valley.
+    </content>
 
 - html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:
 

From 7ab5da549a8ba817d65188f79f1117fc5662adbc Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 17:56:02 +0100
Subject: [PATCH 06/11] fix: Wikipedia URL in example.

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 4e13e7c..2add60b 100644
--- a/README.rst
+++ b/README.rst
@@ -284,7 +284,7 @@ Currently, inscriptis supports the following postprocessors:
 
       inscript --annotation-rules ./wikipedia.json \
                   --postprocessor html \
-                  https://en.wikipedia.org/wiki/Chur.html
+                  https://en.wikipedia.org/wiki/Chur
 
    Annotation rules encoded in the ``wikipedia.json`` file:
 

From 6733dd5728851c8710d3f006678586a9caef2e9f Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 19:09:23 +0100
Subject: [PATCH 07/11] fix: fixed missing bracket in documentation.

---
 src/inscriptis/annotation/output/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/inscriptis/annotation/output/__init__.py b/src/inscriptis/annotation/output/__init__.py
index 41a7fb2..089d1a3 100644
--- a/src/inscriptis/annotation/output/__init__.py
+++ b/src/inscriptis/annotation/output/__init__.py
@@ -10,9 +10,9 @@
     2. The overwritten :meth:`__call__` method may either extend the original
        dictionary which contains the extracted text and annotations (e.g.,
        :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
-       may replace it with an custom output (e.g.,
+       may replace it with a custom output (e.g.,
        :class:`~inscriptis.annotation.output.html.HtmlExtractor` and
-       :class:`~inscriptis.annotation.output.xml.XmlExtractor`.
+       :class:`~inscriptis.annotation.output.xml.XmlExtractor`).
 
 Currently, Inscriptis supports the following built-in AnnotationProcessors:
 
@@ -25,6 +25,7 @@
     of the extracted annotations.
 
 """
+
 from typing import Dict, Any
 
 

From 76ad054febbfd5b6f5896131d58bbadd318f146b Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 19:10:23 +0100
Subject: [PATCH 08/11] chg: backported improved logic from XmlExtractor.

---
 src/inscriptis/annotation/output/html.py | 44 ++++++++----------------
 1 file changed, 14 insertions(+), 30 deletions(-)

diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py
index f7da4a8..8ee5c4a 100644
--- a/src/inscriptis/annotation/output/html.py
+++ b/src/inscriptis/annotation/output/html.py
@@ -1,4 +1,5 @@
 """HTML Annotation Processor."""
+
 from collections import defaultdict
 from itertools import cycle
 from typing import Dict, Any, List
@@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor):
     verbatim = True
 
     def __call__(self, annotated_text: Dict[str, Any]) -> str:
-        tag_indices = defaultdict(list)
+        tag_dict = defaultdict(list)
 
-        for start, end, label in sorted(annotated_text["label"]):
-            tag_indices[start].append(label)
-            tag_indices[end].append("/" + label)
+        for start, end, label in reversed(annotated_text["label"]):
+            tag_dict[start].append(
+                f'<span class="{label}-label">{label}</span><span class="{label}">'
+            )
+            tag_dict[end].insert(0, "</span>")
 
-        open_tags = []
         tagged_content = [
             "<html><head><style>",
             self._get_css(annotated_text["label"]),
             "</style></head><body><pre>",
         ]
-        for idx, ch in enumerate(annotated_text["text"]):
-            if idx in tag_indices:
-                tags = tag_indices[idx]
-                # close tags:
-                for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
-                    open_tags.pop()
-                    tagged_content.append("</span>")
-                # open tags
-                for tag in (
-                    t for t in sorted(tags, reverse=True) if not t.startswith("/")
-                ):
-                    open_tags.append(tag)
-                    tagged_content.append(
-                        '<span class="{tag}-label">{tag}</span>'
-                        '<span class="{tag}">'.format(tag=tag)
-                    )
-
-            if ch == "\n":
-                tagged_content.extend(["</span>" for _ in open_tags])
-                tagged_content.append("</pre>\n<pre>")
-                tagged_content.extend(
-                    ['<span class="{tag}">'.format(tag=tag) for tag in open_tags]
-                )
-            else:
-                tagged_content.append(ch)
 
+        text = annotated_text["text"]
+        current_idx = 0
+        for idx, tags in sorted(tag_dict.items()):
+            tagged_content.append(text[current_idx:idx].replace("\n", "</pre>\n<pre>"))
+            current_idx = idx
+            tagged_content.extend(tags)
+        tagged_content.append(text[current_idx:].replace("\n", "</pre>\n<pre>"))
         return "".join(tagged_content) + "</pre></body></html>"
 
     @staticmethod

From c1e05800239d74d09cc42f99d2f5e970276672c6 Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 19:10:48 +0100
Subject: [PATCH 09/11] chg: minor style changes.

---
 src/inscriptis/annotation/output/xml.py   |  6 +++---
 tests/test_annotation_output_processor.py | 13 ++++++-------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py
index 1a925cc..a791f8c 100644
--- a/src/inscriptis/annotation/output/xml.py
+++ b/src/inscriptis/annotation/output/xml.py
@@ -20,9 +20,9 @@ def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
         current_idx = 0
         text = annotated_text["text"]
         tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n', "<content>\n"]
-        for index, tags in sorted(tag_dict.items()):
-            tagged_content.append(text[current_idx:index])
-            current_idx = index
+        for idx, tags in sorted(tag_dict.items()):
+            tagged_content.append(text[current_idx:idx])
+            current_idx = idx
             tagged_content.extend(tags)
 
         tagged_content.append(text[current_idx:])
diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py
index f03fd78..b5a6e61 100644
--- a/tests/test_annotation_output_processor.py
+++ b/tests/test_annotation_output_processor.py
@@ -15,7 +15,7 @@
     "text": "Chur\n\nChur is the capital and largest town of "
     "the Swiss canton of the Grisons and lies in the "
     "Grisonian Rhine Valley.",
-    "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]],
+    "label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]],
 }
 
 
@@ -36,8 +36,8 @@ def test_surface_annotator():
 
     # and we have additional information on surface forms :)
     assert result["surface"] == [
-        ("heading", "Chur"),
         ("h1", "Chur"),
+        ("heading", "Chur"),
         ("emphasis", "Chur"),
     ]
 
@@ -49,7 +49,7 @@ def test_xml_annotator():
     # and we have additional information on surface forms :)
     assert result == (
         '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
-        "<h1><heading>Chur</heading></h1>\n\n<emphasis>"
+        "<heading><h1>Chur</h1></heading>\n\n<emphasis>"
         "Chur</emphasis> is the capital and largest town "
         "of the Swiss canton of the Grisons and lies in "
         "the Grisonian Rhine Valley.\n</content>"
@@ -60,9 +60,9 @@ def test_html_annotator():
     processor = HtmlExtractor()
     result = processor(EXAMPLE_OUTPUT)
 
+
     assert result.startswith("<html><head><style>")
-    assert result.endswith(
-        "</style></head>"
+    assert result.split("</style>")[1] ==  ("</head>"
         '<body><pre><span class="heading-label">heading'
         '</span><span class="heading">'
         '<span class="h1-label">h1</span><span class="h1">'
@@ -72,8 +72,7 @@ def test_html_annotator():
         '<span class="emphasis">Chur</span> is the capital '
         "and largest town of the Swiss canton of the "
         "Grisons and lies in the Grisonian Rhine Valley."
-        "</pre></body></html>"
-    )
+        "</pre></body></html>")
 
 
 def test_trailing_tag_annotation():

From 8f01fa17d8f05cdd096d4434e54d3ca3d9673a5b Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 19:13:38 +0100
Subject: [PATCH 10/11] chg: upped version.

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c721628..f60795f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "inscriptis"
-version = "2.5.3"
+version = "2.6.0"
 authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
 description = "inscriptis - HTML to text converter."
 keywords = ["HTML", "converter", "text"]

From 6c89cc7b721db8b9f6e35129b39f68b49359fc68 Mon Sep 17 00:00:00 2001
From: Albert Weichselbraun <albert@weichselbraun.net>
Date: Sat, 22 Mar 2025 19:20:22 +0100
Subject: [PATCH 11/11] fix: formatting.

---
 tests/test_annotation_engine.py           | 7 +++++--
 tests/test_annotation_output_processor.py | 7 ++++---
 tests/test_block.py                       | 1 +
 tests/test_cli.py                         | 1 +
 tests/test_custom_html_tag_handling.py    | 1 +
 5 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/tests/test_annotation_engine.py b/tests/test_annotation_engine.py
index 67b9050..699d3e2 100644
--- a/tests/test_annotation_engine.py
+++ b/tests/test_annotation_engine.py
@@ -11,9 +11,12 @@
 def test_get_annotation():
     """Test get_anntation from the Inscriptis class"""
     html = "<b>Chur</b> is a City in <b>Switzerland</b>"
-    rules = {'b': ['bold']}
+    rules = {"b": ["bold"]}
 
     inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules))
 
     assert inscriptis.get_text() == "Chur is a City in Switzerland"
-    assert inscriptis.get_annotations() == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')]
+    assert inscriptis.get_annotations() == [
+        Annotation(start=0, end=4, metadata="bold"),
+        Annotation(start=18, end=29, metadata="bold"),
+    ]
diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py
index b5a6e61..2613566 100644
--- a/tests/test_annotation_output_processor.py
+++ b/tests/test_annotation_output_processor.py
@@ -60,9 +60,9 @@ def test_html_annotator():
     processor = HtmlExtractor()
     result = processor(EXAMPLE_OUTPUT)
 
-
     assert result.startswith("<html><head><style>")
-    assert result.split("</style>")[1] ==  ("</head>"
+    assert result.split("</style>")[1] == (
+        "</head>"
         '<body><pre><span class="heading-label">heading'
         '</span><span class="heading">'
         '<span class="h1-label">h1</span><span class="h1">'
@@ -72,7 +72,8 @@ def test_html_annotator():
         '<span class="emphasis">Chur</span> is the capital '
         "and largest town of the Swiss canton of the "
         "Grisons and lies in the Grisonian Rhine Valley."
-        "</pre></body></html>")
+        "</pre></body></html>"
+    )
 
 
 def test_trailing_tag_annotation():
diff --git a/tests/test_block.py b/tests/test_block.py
index 8aacc93..4ce3f7e 100644
--- a/tests/test_block.py
+++ b/tests/test_block.py
@@ -1,6 +1,7 @@
 """
 Test cases for the Block class.
 """
+
 from inscriptis.model.canvas.block import Block
 from inscriptis.model.canvas.prefix import Prefix
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 4e4cfc4..0c86198 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,6 +1,7 @@
 """
 Tests the Inscriptis CLI client.
 """
+
 from io import StringIO
 from pathlib import Path
 from json import loads
diff --git a/tests/test_custom_html_tag_handling.py b/tests/test_custom_html_tag_handling.py
index d050e6a..342f54c 100644
--- a/tests/test_custom_html_tag_handling.py
+++ b/tests/test_custom_html_tag_handling.py
@@ -1,4 +1,5 @@
 """Test the custom HTML tag handling."""
+
 from lxml.html import fromstring
 
 from inscriptis import Inscriptis, ParserConfig