Skip to content

Commit 97381ca

Browse files
authored
Merge pull request #83 from sbassi/subjectsDB
Support for subjects in SQLiteDB
2 parents 5010b37 + 47244b1 commit 97381ca

3 files changed

Lines changed: 36 additions & 20 deletions

File tree

allofplos/corpus/corpus_class.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def __repr__(self):
3030
def iter_file_doi(self):
3131
"""Generator that returns filename, doi tuples for every file in the corpus.
3232
33-
Used to generate both DOI and file generators for the corpus.
33+
Used to generate both DOI and file generators for the corpus.
3434
"""
3535
return ((file_, filename_to_doi(file_))
3636
for file_ in os.listdir(self.directory)

allofplos/makedb.py

Lines changed: 35 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -17,15 +17,11 @@
1717
DateTimeField, BooleanField, IntegerField, IntegrityError
1818
from playhouse.sqlite_ext import SqliteExtDatabase
1919

20+
from .corpus import Corpus
2021
from .transformations import filename_to_doi, convert_country
22+
from . import starterdir
2123
from .article_class import Article
2224

23-
24-
# TODO: this may need to be updated to take into account the new get_corpus_dir() logic.
25-
# It is not clear, since this is a relative path from the package how this should work without
26-
# diving into the code more thoroughly
27-
corpusdir = 'allofplos/allofplos_xml'
28-
2925
journal_title_dict = {
3026
'PLOS ONE': 'PLOS ONE',
3127
'PLOS GENETICS': 'PLOS Genetics',
@@ -83,6 +79,9 @@ class Country(BaseModel):
8379
class Affiliations(BaseModel):
8480
affiliations = CharField(unique=True)
8581

82+
class Subjects(BaseModel):
83+
subjects = CharField(unique=True)
84+
8685
class CorrespondingAuthor(BaseModel):
8786
corr_author_email = CharField(unique=True)
8887
tld = TextField(null=True)
@@ -105,26 +104,25 @@ class PLOSArticle(BaseModel):
105104
word_count = IntegerField()
106105
JATS_type = ForeignKeyField(JATSType, related_name='jats')
107106

107+
class SubjectsPLOSArticle(BaseModel):
108+
subject = ForeignKeyField(Subjects)
109+
article = ForeignKeyField(PLOSArticle)
110+
108111
class CoAuthorPLOSArticle(BaseModel):
109112
corr_author = ForeignKeyField(CorrespondingAuthor)
110113
article = ForeignKeyField(PLOSArticle)
111114

112115
db.connect()
113-
db.create_tables([Journal, PLOSArticle, ArticleType,
114-
CoAuthorPLOSArticle, CorrespondingAuthor,
115-
JATSType, Affiliations, Country])
116+
db.create_tables([Journal, PLOSArticle, ArticleType, CoAuthorPLOSArticle,
117+
CorrespondingAuthor, JATSType, Affiliations, Country,
118+
SubjectsPLOSArticle, Subjects])
116119

117-
if args.starter:
118-
allfiles = os.listdir('starter_corpus')
119-
else:
120-
allfiles = os.listdir(corpusdir)
121-
if args.random:
122-
randomfiles = random.sample(allfiles, args.random)
123-
max_value = len(randomfiles)
124-
else:
125-
max_value = len(allfiles)
126120

127-
for i, file_ in enumerate(tqdm(randomfiles if args.random else allfiles)):
121+
corpus_dir = starterdir if args.starter else None
122+
allfiles = Corpus(corpus_dir).files
123+
files = random.sample(allfiles, args.random) if args.random else allfiles
124+
125+
for file_ in tqdm(files):
128126
doi = filename_to_doi(file_)
129127
article = Article(doi)
130128
journal_name = journal_title_dict[article.journal.upper()]
@@ -155,6 +153,24 @@ class CoAuthorPLOSArticle(BaseModel):
155153
created_date = article.pubdate,
156154
word_count=article.word_count,
157155
JATS_type = j_type)
156+
# Get subject information
157+
taxonomy_set = set()
158+
taxonomy = article.taxonomy
159+
for values in taxonomy.values():
160+
for value in values:
161+
for taxon in value:
162+
taxonomy_set.add(taxon)
163+
for taxon in taxonomy_set:
164+
with db.atomic() as atomic:
165+
try:
166+
subject = Subjects.create(subjects = taxon)
167+
except (sqlite3.IntegrityError, IntegrityError):
168+
db.rollback()
169+
subject = Subjects.get(Subjects.subjects == taxon)
170+
SubjectsPLOSArticle.create(
171+
subject = subject,
172+
article = p_art
173+
)
158174
if article.authors:
159175
iterable_authors = article.authors
160176
else:

allofplos/starter.db

96 KB
Binary file not shown.

0 commit comments

Comments
 (0)