Skip to content

Commit 1559018

Browse files
authored
Restore handling of DeleteCitation and ensure memory safety (close #166)
1 parent eaa3280 commit 1559018

1 file changed

Lines changed: 15 additions & 3 deletions

File tree

pubmed_parser/medline_parser.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -740,17 +740,29 @@ def parse_medline_xml(
740740
------
741741
An iterator of dictionaries containing information about articles in NLM format.
742742
(see `parse_article_info`). Articles that have been deleted will be
743-
added with no information other than the field `delete` being `True`
743+
added with only two fields: `delete`, which is `True`,
744+
and `pmid`.
744745
745746
Examples
746747
--------
747748
>>> article_iterator = pubmed_parser.parse_medline_xml('data/pubmed20n0014.xml.gz')
748749
>>> for article in article_iterator:
749-
... print(article['title'])
750+
... if article.get('delete'):
751+
... print(f"Deleted PMID: {article['pmid']}")
752+
... else:
753+
... print(article['title'])
750754
"""
751755
with gzip.open(path, "rb") as f:
752756
for event, element in etree.iterparse(f, events=("end",)):
753-
if element.tag == "PubmedArticle":
757+
# Handle <DeleteCitation> elements, indicating articles removed from PubMed.
758+
if element.tag == "DeleteCitation":
759+
# These elements are expected to contain one or more PMID tags.
760+
for child in element.iterchildren():
761+
assert child.tag == "PMID", f"PMID tag expected. Got: {child.tag}"
762+
yield {"pmid": child.text, "delete": True}
763+
element.clear()
764+
765+
elif element.tag == "PubmedArticle":
754766
res = parse_article_info(
755767
element,
756768
year_info_only,

0 commit comments

Comments
 (0)