11"""XML Annotation processor."""
2+
23from collections import defaultdict
3- from typing import Dict , Any , Tuple
4+ from typing import Dict , Any
45
5- from lxml import etree
66from inscriptis .annotation .output import AnnotationProcessor
77
88
class XmlExtractor(AnnotationProcessor):
    """Provide the annotations as XML tags embedded in the text."""

    verbatim = True

    def __call__(
        self, annotated_text: Dict[str, Any], root_element="content"
    ) -> str:
        """Provide an XML version of the given text and annotations.

        Args:
            annotated_text: a dictionary containing the plain text (key
                ``"text"``) and the extracted annotations (key ``"label"``)
                as an ordered sequence of ``(start, end, tag)`` triples.
            root_element: name of the XML element that wraps the tagged
                content.

        Returns:
            A string with the XML-version of the content.
        """
        # Maps a text offset to the tags that must be emitted at that offset.
        tag_dict = defaultdict(list)
        # The annotations are walked back to front. Opening tags are appended
        # and closing tags are inserted at the front, so at any given offset
        # every closing tag is emitted before any opening tag.
        for start, end, tag in reversed(annotated_text["label"]):
            tag_dict[start].append(f"<{tag}>")
            tag_dict[end].insert(0, f"</{tag}>")

        current_idx = 0
        text = annotated_text["text"]
        # NOTE: text and tag names are copied verbatim (no XML escaping).
        tagged_content = [
            '<?xml version="1.0" encoding="UTF-8" ?>\n',
            f"<{root_element}>\n",
        ]
        for index, tags in sorted(tag_dict.items()):
            # Emit the text since the previous offset, then this offset's tags.
            tagged_content.append(text[current_idx:index])
            current_idx = index
            tagged_content.extend(tags)

        # Remaining text after the last tagged offset.
        tagged_content.append(text[current_idx:])
        tagged_content.append(f"\n</{root_element}>")
        return "".join(tagged_content)
0 commit comments