-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb2pubmed.qmd
More file actions
338 lines (279 loc) · 12.2 KB
/
web2pubmed.qmd
File metadata and controls
338 lines (279 loc) · 12.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
---
title: "web2pubmed"
author: "lizaharrison"
format: html
---
Code downloaded from
[GitHub](https://github.com/lizaharrison/web2pubmed/blob/master/retrieval_cleansing_functions.py).
```{r}
#| label: Load R libraries
# reticulate bridges R and Python so the {python} chunk below can execute.
library(reticulate)
```
```{r}
#| label: Install Python packages
library(reticulate)
# csv, itertools, re and time ship with Python itself and are not conda
# packages -- asking py_install() to install them makes the call fail.
# Only the third-party dependencies of the code below need installing.
py_install(c("eutils",
             "requests",
             "lxml",
             "pandas"),
           method = "conda")
```
```{python}
#| label: Python code from GitHub
"""
Author: Eliza Harrison
This module contains the function used to retrieve PubMed article metadata from the PubMed
database using the NCBI E-Utilities API, as well as all functions used for the cleansing and
preparation of PubMed and webpage data.
"""
import csv
import eutils
import requests
import lxml
import pandas as pd
import itertools
import re
import time
# E-UTILITIES PUBMED RETRIEVAL FUNCTION
def eutils_from_df(input_df, chunksize, output_csv):
    """
    Retrieves PubMed article metadata via the NCBI E-Utilities API and saves it to CSV.

    PMIDs are taken from input_df in chunks of ``chunksize``, fetched with the
    eutils efetch client, written row-by-row to ``output_csv`` and accumulated
    into a DataFrame that is returned to the caller.

    Args:
        input_df: DataFrame containing the PMIDs of interest in a 'PMID' column.
        chunksize: number of PMIDs to pass to the API per request.
        output_csv: filename of the CSV file to which article content is appended.

    Returns:
        DataFrame with one row per successfully retrieved PMID and columns
        'PMID', 'Year', 'Title', 'Abstract', 'Authors', 'Journal', 'Volume',
        'Issue', 'Pages', 'DOI', 'PMC'.  The same rows are written to
        output_csv as a side effect.
    """
    # Output CSV column headers (must match the dict keys written per article).
    fieldnames = ['PMID',
                  'Year',
                  'Title',
                  'Abstract',
                  'Authors',
                  'Journal',
                  'Volume',
                  'Issue',
                  'Pages',
                  'DOI',
                  'PMC',
                  ]
    # Lazily yields successive chunks of the input dataframe.
    pm_chunks_gen = (input_df[i:i + chunksize] for i in range(0, len(input_df), chunksize))
    # Collects one dict of article attributes per retrieved PMID.
    pm_article_list = []
    # eutils client used to access the NCBI E-Utilities API.
    ec = eutils.Client()
    # newline='' is required by the csv module to avoid blank rows on Windows.
    # NOTE(review): mode 'a' plus an unconditional writeheader() duplicates the
    # header row whenever the file already exists -- confirm this is intended.
    with open(output_csv, 'a', newline='') as datafile:
        writer = csv.DictWriter(datafile,
                                fieldnames=fieldnames,
                                )
        writer.writeheader()
        # enumerate replaces the original zip(range(len(input_df)), gen): the
        # number of chunks can never exceed len(input_df), so it is equivalent.
        for chunk_count, chunk in enumerate(pm_chunks_gen):
            try:
                index_list = list(chunk.index.values)
                chunk_list = list(chunk['PMID'])
                print('Chunk No. ' + str(chunk_count))
                # Passes the chunk of PMIDs to the API; the iterator yields one
                # article object per PMID retrieved.
                pm_article_set = iter(ec.efetch(db='pubmed',
                                                id=chunk_list,
                                                )
                                      )
                # Walk the chunk and the article iterator in lockstep.
                for id_index, id_value in enumerate(chunk_list):
                    print(index_list[id_index], id_value)
                    try:
                        pm_article = next(pm_article_set)
                        # Aggregates key article attributes into one record.
                        pm_article_content = dict(
                            PMID=str(pm_article.pmid),
                            Year=str(pm_article.year),
                            Title=str(pm_article.title),
                            Abstract=str(pm_article.abstract),
                            Authors=str(pm_article.authors),
                            Journal=str(pm_article.jrnl),
                            Volume=str(pm_article.volume),
                            Issue=str(pm_article.issue),
                            Pages=str(pm_article.pages),
                            DOI=str(pm_article.doi),
                            PMC=str(pm_article.pmc),
                        )
                        print(pm_article_content)
                        # 'Entrez' corrects the original 'Enterez' misspelling.
                        print(pm_article.pmid + ' - Download from Entrez complete')
                        # Keep the record in memory for the returned DataFrame.
                        pm_article_list.append(pm_article_content)
                        print(pm_article.pmid + ' - Save to list complete')
                        # And persist it immediately for future reference.
                        writer.writerow(pm_article_content)
                        print(pm_article.pmid + ' - Write Data to CSV Complete')
                    # Content errors affect a single article: log and move on
                    # to the next PMID in the chunk.
                    except (StopIteration,
                            TypeError,
                            NameError,
                            ValueError,
                            lxml.etree.XMLSyntaxError,
                            eutils.exceptions.EutilsNCBIError,
                            ) as e1:
                        print('Error: ' + str(e1))
                        continue
            # Network/connection errors: back off briefly, then try the next chunk.
            except (TimeoutError,
                    RuntimeError,
                    ConnectionError,
                    ConnectionResetError,
                    eutils.exceptions.EutilsRequestError,
                    requests.exceptions.ConnectionError,
                    ) as e2:
                print('Error: ' + str(e2))
                time.sleep(10)
                continue
            except StopIteration:
                print('All downloads complete')
                break
    # The with-block closes the file; the original's explicit datafile.close()
    # was redundant and has been removed.
    pm_article_df = pd.DataFrame.from_records(pm_article_list,
                                              columns=fieldnames,
                                              )
    print('Save to DataFrame complete')
    return pm_article_df
# LONGEST COMMON SUBSTRING ANALYSIS FUNCTIONS
def lcs_algorithm(str1, str2):
    """
    Extracts the longest common substring (of whole words) shared by two strings.

    Punctuation is stripped and comparison is case-insensitive; the match is
    computed word-by-word rather than character-by-character.

    Args:
        str1: Input string 1.
        str2: Input string 2.

    Returns:
        A list holding the longest common word sequence joined back into a
        single space-separated string (at most one element; empty when the
        strings share no words).  Note: the original docstring said a set was
        returned, but the function has always returned a list.

    SOURCE: Code adapted from
    https://www.bogotobogo.com/python/python_longest_common_substring_lcs_algorithm_generalized_suffix_tree.php
    """
    # Remove punctuation so it cannot prematurely terminate a common run of words.
    str1 = re.sub(r'[^\w\s]', '', str1)
    str2 = re.sub(r'[^\w\s]', '', str2)
    # Split into word tuples so the lcs is computed by word (vs character).
    str1_words = tuple(str1.lower().split())
    str2_words = tuple(str2.lower().split())
    m = len(str1_words)
    n = len(str2_words)
    # matrix[i+1][j+1] = length of the common suffix ending at words i and j.
    matrix = [[0] * (n + 1) for _ in range(m + 1)]
    longest = 0
    lcs_set = set()
    for i in range(m):
        for j in range(n):
            if str1_words[i] != str2_words[j]:
                continue
            x = matrix[i][j] + 1
            matrix[i + 1][j + 1] = x
            if x > longest:
                # Strictly longer run found: keep only this one.
                longest = x
                lcs_set = {str1_words[i - x + 1: i + 1]}
    return [' '.join(tup) for tup in lcs_set]
def lcs_analysis(series, min_similarity):
    """
    Calculates the longest common substring (lcs) for every pair of series values.

    Args:
        series: pandas Series of strings, indexed by article id.
        min_similarity: minimum percentage similarity a pair must exceed for it
            to be recorded in the output.

    Returns:
        lcs_df: DataFrame with columns 'article id 1', 'article id 2', 'lcs'
        and '%age common' -- one row per pair whose similarity exceeds
        min_similarity.
    """
    t0 = time.time()
    print('Beginning Longest Common Substring analysis...')
    # All distinct pairs of index positions.
    article_pairs = itertools.combinations(series.index, 2)
    # Parallel lists: [ids 1, ids 2, lcs values, % similarity].
    lcs_list = [[], [], [], []]
    for index_1, index_2 in article_pairs:
        # Guard against duplicate index labels producing a self-pair.
        if index_1 == index_2:
            continue
        str1 = series[index_1]
        str2 = series[index_2]
        lcs = lcs_algorithm(str1, str2)
        if not lcs:
            continue
        # Similarity = lcs length relative to the longer of the two strings.
        pct = (len(lcs[0]) / max(len(str1), len(str2))) * 100
        if pct > min_similarity:
            print('%s - %s Longest common substring > min threshold' % (index_1, index_2))
            lcs_list[0].append(index_1)
            lcs_list[1].append(index_2)
            lcs_list[2].append(lcs)
            lcs_list[3].append(pct)
    print('Longest common substring analysis of all records complete')
    lcs_df = pd.DataFrame(lcs_list).transpose()
    lcs_df.columns = ['article id 1', 'article id 2', 'lcs', '%age common']
    t1 = time.time()
    elapsed = round(t1 - t0, 2)
    # The original printed a literal '/n'; '\n' is the intended newline escape.
    print('%s seconds / %s minutes elapsed\n' % (elapsed, elapsed / 60))
    return lcs_df
# SELECTION OF 1:1 PMID-URL LINKS
def select_web_records(for_selection, full_corpus):
    """
    Selects one webpage record per PMID for inclusion in the final 1:1 corpus.

    Candidates whose PMID or URL is already present in the corpus are excluded
    up front.  When a PMID has several candidate pages, the one with the
    longest corpus text is preferred; a URL that has already been selected (or
    is already in the corpus) is discarded and the next-longest candidate is
    tried instead.

    Args:
        for_selection: DataFrame of candidate web records with at least the
            columns 'pmid', 'url' and 'corpus_text'.
        full_corpus: DataFrame of records already accepted into the corpus,
            with 'pmid' and 'url' columns.

    Returns:
        A new DataFrame: full_corpus plus the newly selected candidate rows.
    """
    # Drop candidates whose PMID or URL is already represented in the corpus.
    subset = for_selection.loc[(~for_selection['pmid'].isin(full_corpus['pmid'].values))
                               & (~for_selection['url'].isin(full_corpus['url'].values))]
    selected_web_ids = []
    selected_urls = []
    for pmid in subset.loc[:, 'pmid'].unique():
        web_ids = subset.loc[subset['pmid'] == pmid].index.values
        urls = subset.loc[subset['pmid'] == pmid, 'url'].values
        corpus_text = subset.loc[subset['pmid'] == pmid, 'corpus_text'].values
        text_lengths = [len(text) for text in corpus_text]
        not_in_corpus = True
        while not_in_corpus is True:
            if len(web_ids) > 1:
                # Prefer the candidate with the longest web article text.
                i = text_lengths.index(max(text_lengths))
            elif len(web_ids) == 1:
                i = 0
            else:
                # Every candidate for this PMID duplicated an already-used URL.
                not_in_corpus = False
                break
            selected_id = web_ids[i]
            selected_url = urls[i]
            selected_corpus_text = corpus_text[i]
            if selected_url in selected_urls or \
                    selected_url in full_corpus['url'].values:
                # URL already taken: discard this candidate and retry with the rest.
                web_ids = [x for x in web_ids if x != selected_id]
                urls = [x for x in urls if x != selected_url]
                corpus_text = [x for x in corpus_text if x != selected_corpus_text]
                text_lengths = [len(text) for text in corpus_text]
            else:
                selected_web_ids.append(selected_id)
                selected_urls.append(selected_url)
                not_in_corpus = False
                print('Web record %s added to final corpus (%s)' % (selected_id, selected_url))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and produces the same row ordering here.
    full_corpus_updated = pd.concat([full_corpus, subset.loc[selected_web_ids]])
    return full_corpus_updated
```