Skip to content

Commit 986a391

Browse files
committed
use defusedxml for xml parsing
1 parent 426b5fb commit 986a391

2 files changed

Lines changed: 12 additions & 12 deletions

File tree

colrev/env/tei_parser.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -6,13 +6,13 @@
66
import re
77
import typing
88
from pathlib import Path
9-
from xml.etree.ElementTree import ElementTree as StdElementTree
10-
from xml.etree.ElementTree import ParseError
11-
from xml.etree.ElementTree import register_namespace
12-
from xml.etree.ElementTree import tostring
9+
from xml.etree.ElementTree import (
10+
register_namespace,
11+
) # nosec B405 - namespace registration only
1312

1413
import requests
1514
from defusedxml import ElementTree as DefusedET
15+
from defusedxml.common import DefusedXmlException
1616

1717
import colrev.env.grobid_service
1818
import colrev.exceptions as colrev_exceptions
@@ -147,8 +147,8 @@ def _create_tei(self) -> None:
147147
xml_fstring = file.read()
148148
self.root = DefusedET.fromstring(xml_fstring)
149149

150-
tree = StdElementTree(self.root)
151-
tree.write(str(self.tei_path), encoding="utf-8")
150+
with open(self.tei_path, "wb") as file:
151+
file.write(DefusedET.tostring(self.root, encoding="utf-8"))
152152
except requests.exceptions.ConnectionError as exc: # pragma: no cover
153153
print(exc)
154154
print(str(self.pdf_path))
@@ -158,8 +158,8 @@ def get_tei_str(self) -> str:
158158
"""Get the TEI string."""
159159
try:
160160
register_namespace("tei", "http://www.tei-c.org/ns/1.0")
161-
return tostring(self.root, encoding="unicode")
162-
except ParseError as exc: # pragma: no cover
161+
return DefusedET.tostring(self.root, encoding="unicode")
162+
except (DefusedET.ParseError, DefusedXmlException) as exc: # pragma: no cover
163163
raise colrev_exceptions.TEIException from exc
164164

165165
def get_grobid_version(self) -> str:
@@ -731,8 +731,8 @@ def mark_references(self, *, records: dict): # type: ignore
731731
# if settings file available: dedupe_io match against records
732732

733733
if self.tei_path:
734-
tree = StdElementTree(self.root)
735-
tree.write(str(self.tei_path))
734+
with open(self.tei_path, "wb") as file:
735+
file.write(DefusedET.tostring(self.root, encoding="utf-8"))
736736

737737
return self.root
738738

colrev/packages/pubmed/src/pubmed_api.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,10 @@
77
import typing
88
from sqlite3 import OperationalError
99
from xml.etree.ElementTree import Element # nosec
10-
from xml.etree.ElementTree import ParseError
1110

1211
import requests
1312
from defusedxml import ElementTree as DefusedET
13+
from defusedxml.common import DefusedXmlException
1414

1515
import colrev.exceptions as colrev_exceptions
1616
import colrev.record.record
@@ -199,7 +199,7 @@ def query_id(self, *, pubmed_id: str) -> colrev.record.record.Record:
199199
return retrieved_record
200200
except requests.exceptions.RequestException as exc:
201201
raise PubmedAPIError from exc
202-
except ParseError as exc:
202+
except (DefusedET.ParseError, DefusedXmlException) as exc:
203203
raise colrev_exceptions.RecordNotParsableException(
204204
"Error parsing xml"
205205
) from exc

0 commit comments

Comments
 (0)