-
Notifications
You must be signed in to change notification settings - Fork 34
Expand file tree
/
Copy pathxml.py
More file actions
30 lines (22 loc) · 1.01 KB
/
xml.py
File metadata and controls
30 lines (22 loc) · 1.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""XML Annotation processor."""
from collections import defaultdict
from typing import Dict, Any
from inscriptis.annotation.output import AnnotationProcessor
class XmlExtractor(AnnotationProcessor):
    """Provide the converted text with XML-style annotations."""

    verbatim = True

    def __call__(
        self, annotated_text: Dict[str, Any], root_element="content"
    ) -> str:
        """Serialize annotated text as an XML document with inline tags.

        Args:
            annotated_text: Mapping with a ``"text"`` entry holding the
                converted text and a ``"label"`` entry holding a sequence
                of ``(start, end, tag)`` triples. ``start`` and ``end`` are
                used as slice offsets into the text.
            root_element: Name of the element that wraps the whole
                document. Defaults to ``"content"``.

        Returns:
            The XML declaration, followed by the text wrapped in
            ``<root_element>`` with each label rendered as an opening
            ``<tag>`` at its start offset and a closing ``</tag>`` at its
            end offset.
        """
        # NOTE(review): neither the text nor the tag names are XML-escaped
        # here, so a literal "<" or "&" in the text ends up verbatim in the
        # output. Confirm what callers expect before adding escaping.

        # Group the markup to emit by text offset. Labels are walked in
        # reverse; opening tags are appended and closing tags are inserted
        # at the front of the list for their offset, so at any shared
        # offset closing tags always precede opening tags.
        # Presumably the label order is chosen so nested spans come out
        # properly nested - verify against the code producing the labels.
        tag_dict = defaultdict(list)
        for start, end, tag in reversed(annotated_text["label"]):
            tag_dict[start].append(f"<{tag}>")
            tag_dict[end].insert(0, f"</{tag}>")

        text = annotated_text["text"]
        # Honour the caller-supplied root element name instead of a
        # hard-coded "content" wrapper.
        tagged_content = [
            '<?xml version="1.0" encoding="UTF-8" ?>\n',
            f"<{root_element}>\n",
        ]

        # Walk the offsets in ascending order, copying the untagged text
        # between consecutive offsets and then the markup for that offset.
        current_idx = 0
        for idx, tags in sorted(tag_dict.items()):
            tagged_content.append(text[current_idx:idx])
            current_idx = idx
            tagged_content.extend(tags)

        # Remaining text after the last tagged offset, then close the root.
        tagged_content.append(text[current_idx:])
        tagged_content.append(f"\n</{root_element}>")
        return "".join(tagged_content)