Skip to content

Commit eaa3280

Browse files
Stringify all children (close #158)
1 parent 374a336 commit eaa3280

4 files changed

Lines changed: 15 additions & 17 deletions

File tree

pubmed_parser/medline_parser.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -579,10 +579,10 @@ def parse_article_info(
579579
medline = pubmed_article.find("MedlineCitation")
580580
article = medline.find("Article")
581581

582-
if article.find("ArticleTitle") is not None:
583-
title = stringify_children(article.find("ArticleTitle")).strip() or ""
584-
else:
585-
title = ""
582+
try:
583+
title = stringify_children(article.find("ArticleTitle")) or None
584+
except AttributeError:
585+
title = None
586586

587587
if article.find("Journal/JournalIssue/Volume") is not None:
588588
volume = article.find("Journal/JournalIssue/Volume").text or ""

pubmed_parser/pubmed_oa_parser.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -464,7 +464,7 @@ def parse_pubmed_caption(path):
464464

465465
fig_captions = fig.find("caption")
466466
if fig_captions is not None:
467-
fig_captions = fig_captions.getchildren()
467+
fig_captions = fig_captions.getchildren()[:1]
468468
caption = " ".join([stringify_children(c) for c in fig_captions])
469469

470470
graphic = fig.find("graphic")

pubmed_parser/utils.py

Lines changed: 2 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -41,16 +41,8 @@ def read_xml(path, nxml=False):
4141

4242

4343
def stringify_children(node):
44-
"""
45-
Filters and removes possible Nones in texts and tails
46-
ref: http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
47-
"""
48-
parts = (
49-
[node.text]
50-
+ list(chain(*([c.text, c.tail] for c in node.getchildren())))
51-
+ [node.tail]
52-
)
53-
return "".join(filter(None, parts))
44+
"""Joins all string parts excluding empty parts."""
45+
return "".join(text.strip() for text in node.itertext() if text)
5446

5547

5648
def stringify_affiliation(node):

tests/test_pubmed_oa_parser.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -110,12 +110,18 @@ def test_parse_pubmed_caption():
110110

111111
def test_parse_pubmed_caption_content():
112112
"""This is a test for the caption content."""
113-
fig_caption = 'Aerosol delivery of sACE22.v2.4‐IgG1 alleviates lung injury and improves survival of SARS‐CoV‐2 gamma variant infected K18‐hACE2 transgenic mice \n\n'
113+
fig_caption = 'Aerosol delivery of sACE22.v2.4‐IgG1 alleviates lung injury and improves survival of SARS‐CoV‐2 gamma variant infected K18‐hACE2 transgenic mice'
114114
assert captions_9539395[0]['fig_caption'] == fig_caption
115115
assert captions_9539395[0]['fig_id'] == 'emmm202216109-fig-0001'
116116
assert captions_9539395[0]['fig_label'] == 'Figure 1'
117117
assert captions_9539395[8]['fig_label'] is None
118-
fig_list_items = [('A', 'K18‐hACE2 transgenic mice were inoculated with SARS‐CoV‐2 isolate /Japan/TY7‐503/2021 (gamma variant) at 1\u2009×\u2009104 PFU. sACE22.v2.4‐IgG1 (7.5\u2009ml at 8.3\u2009mg/ml in PBS) was delivered to the mice by a nebulizer in 25\u2009min at 12\u2009h, 48\u2009h, and 84\u2009h postinoculation. PBS was aerosol delivered as control.'), ('B, C', 'Survival (B) and weight loss (C). N\u2009=\u200910 mice for each group. The P‐value of the survival curve by the Gehan–Breslow–Wilcoxon test is shown. Error bars for mouse weight are centered on the mean and show SEM.'), ('D', "Viral load in the lung was measured by RT–qPCR on Day 7. The mRNA expression levels of SARS‐CoV‐2 Spike, Nsp, and Rdrp are normalized to the housekeeping gene peptidylprolyl isomerase A (Ppia). Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."), ('E', "Cytokine expression levels of Tnfa, Ifng, Il1a, and Il1b were measured by RT–qPCR normalized by Ppia. Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."), ('F, G', 'Representative H&E staining of lung sections on Day 7 postinoculation for control PBS group (F) and inhalation of the sACE22.v2.4‐IgG1 group (G). Images at left are low magnifications. Boxed regions (black) are shown at higher magnification on the right. Lungs from 4 independent mice were sectioned, stained, and imaged.')]
118+
fig_list_items = [
119+
('A', 'K18‐hACE2 transgenic mice were inoculated with SARS‐CoV‐2 isolate /Japan/TY7‐503/2021 (gamma variant) at 1\u2009×\u2009104PFU. sACE22.v2.4‐IgG1 (7.5\u2009ml at 8.3\u2009mg/ml in PBS) was delivered to the mice by a nebulizer in 25\u2009min at 12\u2009h, 48\u2009h, and 84\u2009h postinoculation. PBS was aerosol delivered as control.'),
120+
('B, C', 'Survival (B) and weight loss (C).N=\u200910 mice for each group. The P‐value of the survival curve by the Gehan–Breslow–Wilcoxon test is shown. Error bars for mouse weight are centered on the mean and show SEM.'),
121+
('D', "Viral load in the lung was measured by RT–qPCR on Day 7. The mRNA expression levels of SARS‐CoV‐2 Spike, Nsp, and Rdrp are normalized to the housekeeping gene peptidylprolyl isomerase A (Ppia). Data are presented as mean\u2009±\u2009SEM,N=\u20094 mice per group. *P<\u20090.05 by the unpaired Student'st‐test with two‐sided."),
122+
('E', "Cytokine expression levels of Tnfa, Ifng, Il1a, and Il1b were measured by RT–qPCR normalized by Ppia. Data are presented as mean\u2009±\u2009SEM,N=\u20094 mice per group. *P<\u20090.05 by the unpaired Student'st‐test with two‐sided."),
123+
('F, G', 'Representative H&E staining of lung sections on Day 7 postinoculation for control PBS group (F) and inhalation of the sACE22.v2.4‐IgG1 group (G). Images at left are low magnifications. Boxed regions (black) are shown at higher magnification on the right. Lungs from 4 independent mice were sectioned, stained, and imaged.')
124+
]
119125
assert captions_9539395[0]['fig_list-items'] == fig_list_items
120126
assert captions_9539395[0]['graphic_ref'] == 'EMMM-14-e16109-g008'
121127
assert captions_9539395[8]['graphic_ref'] is None

0 commit comments

Comments
 (0)