Skip to content

Commit 2dc637c

Browse files
committed
move main() to update.py
point plos_corpus.py at update.py; delete `main()` from corpus/plos_corpus.py
1 parent 0b4d942 commit 2dc637c

3 files changed

Lines changed: 50 additions & 49 deletions

File tree

allofplos/corpus/plos_corpus.py

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -641,50 +641,3 @@ def download_corpus_metadata_files(csv_abstracts=True, csv_no_abstracts=True, sq
641641
inF.close()
642642
outF.close()
643643
print("Extraction complete.")
644-
645-
646-
def main():
647-
"""
648-
Entry point for the program. This is used when the program is used as a
649-
standalone script
650-
:return: None
651-
"""
652-
directory = get_corpus_dir()
653-
654-
# Step 0: Initialize first copy of repository
655-
try:
656-
corpus_files = [name for name in os.listdir(directory) if os.path.isfile(
657-
os.path.join(directory, name))]
658-
except FileNotFoundError:
659-
corpus_files = []
660-
if len(corpus_files) < min_files_for_valid_corpus:
661-
print('Not enough articles in {}, re-downloading zip file'.format(directory))
662-
# TODO: check if zip file is in top-level directory before downloading
663-
create_local_plos_corpus()
664-
665-
# Step 1: Query solr via URL and construct DOI list
666-
# Filtered by article type & scheduled for the last 14 days.
667-
# Returns specific URL query & the number of search results.
668-
# Parses the returned dictionary of article DOIs, removing common leading numbers, as a list.
669-
# Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download.
670-
print("Checking for new articles...")
671-
dois_needed_list = get_dois_needed_list()
672-
673-
# Step 2: Download new articles
674-
# For every doi in dois_needed_list, grab the accompanying XML from journal pages
675-
# If no new articles, don't run any other cells
676-
# Check if articles are uncorrected proofs
677-
# Check if amended articles linked to new amendment articles are updated
678-
# Merge new XML into folder
679-
# If need to bulk download, please start here:
680-
# https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk
681-
download_check_and_move(dois_needed_list,
682-
uncorrected_proofs_text_list,
683-
tempdir=newarticledir,
684-
destination=get_corpus_dir()
685-
)
686-
return None
687-
688-
689-
if __name__ == "__main__":
690-
main()

allofplos/plos_corpus.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import warnings
22

3-
from .corpus.plos_corpus import main
3+
from .update import main
44

55
if __name__ == "__main__":
66
warnings.simplefilter('always', DeprecationWarning)

allofplos/update.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,52 @@
1-
from .corpus.plos_corpus import main
1+
import os
2+
3+
from . import get_corpus_dir, newarticledir, uncorrected_proofs_text_list
4+
from .corpus.plos_corpus import (create_local_plos_corpus, get_dois_needed_list, download_check_and_move,
5+
min_files_for_valid_corpus)
6+
7+
8+
def main():
9+
"""
10+
Entry point for the program. This is used when the program is used as a
11+
standalone script
12+
:return: None
13+
"""
14+
directory = get_corpus_dir()
15+
16+
# Step 0: Initialize first copy of repository
17+
try:
18+
corpus_files = [name for name in os.listdir(directory) if os.path.isfile(
19+
os.path.join(directory, name))]
20+
except FileNotFoundError:
21+
corpus_files = []
22+
if len(corpus_files) < min_files_for_valid_corpus:
23+
print('Not enough articles in {}, re-downloading zip file'.format(directory))
24+
# TODO: check if zip file is in top-level directory before downloading
25+
create_local_plos_corpus()
26+
27+
# Step 1: Query solr via URL and construct DOI list
28+
# Filtered by article type & scheduled for the last 14 days.
29+
# Returns specific URL query & the number of search results.
30+
# Parses the returned dictionary of article DOIs, removing common leading numbers, as a list.
31+
# Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download.
32+
print("Checking for new articles...")
33+
dois_needed_list = get_dois_needed_list()
34+
35+
# Step 2: Download new articles
36+
# For every doi in dois_needed_list, grab the accompanying XML from journal pages
37+
# If no new articles, don't run any other cells
38+
# Check if articles are uncorrected proofs
39+
# Check if amended articles linked to new amendment articles are updated
40+
# Merge new XML into folder
41+
# If need to bulk download, please start here:
42+
# https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk
43+
download_check_and_move(dois_needed_list,
44+
uncorrected_proofs_text_list,
45+
tempdir=newarticledir,
46+
destination=get_corpus_dir()
47+
)
48+
return None
49+
250

351
if __name__ == "__main__":
452
main()

0 commit comments

Comments
 (0)