@@ -740,17 +740,29 @@ def parse_medline_xml(
740740 ------
741741 An iterator of dictionaries containing information about articles in NLM format
742742 (see `parse_article_info`). Articles that have been deleted will be
743- added with no information other than the field `delete` being `True`
743+ added with no information other than the fields `delete` (set to `True`)
744+ and `pmid`.
744745
745746 Examples
746747 --------
747748 >>> article_iterator = pubmed_parser.parse_medline_xml('data/pubmed20n0014.xml.gz')
748749 >>> for article in article_iterator:
749- ... print(article['title'])
750+ ... if article.get('delete'):
751+ ... print(f"Deleted PMID: {article['pmid']}")
752+ ... else:
753+ ... print(article['title'])
750754 """
751755 with gzip .open (path , "rb" ) as f :
752756 for event , element in etree .iterparse (f , events = ("end" ,)):
753- if element .tag == "PubmedArticle" :
757+ # Handle <DeleteCitation> elements, indicating articles removed from PubMed.
758+ if element .tag == "DeleteCitation" :
759+ # These elements are expected to contain one or more PMID tags.
760+ for child in element .iterchildren ():
761+ assert child .tag == "PMID" , f"PMID tag expected. Got: { child .tag } "
762+ yield {"pmid" : child .text , "delete" : True }
763+ element .clear ()
764+
765+ elif element .tag == "PubmedArticle" :
754766 res = parse_article_info (
755767 element ,
756768 year_info_only ,
0 commit comments