1717 DateTimeField , BooleanField , IntegerField , IntegrityError
1818from playhouse .sqlite_ext import SqliteExtDatabase
1919
20+ from .corpus import Corpus
2021from .transformations import filename_to_doi , convert_country
22+ from . import starterdir
2123from .article_class import Article
2224
23-
24- # TODO: this may need to be updated to take into account the new get_corpus_dir() logic.
25- # It is not clear, since this is a relative path from the package how this should work without
26- # diving into the code more thoroughly
27- corpusdir = 'allofplos/allofplos_xml'
28-
2925journal_title_dict = {
3026 'PLOS ONE' : 'PLOS ONE' ,
3127 'PLOS GENETICS' : 'PLOS Genetics' ,
@@ -83,6 +79,9 @@ class Country(BaseModel):
class Affiliations(BaseModel):
    """Lookup table holding one row per distinct affiliation string."""

    # unique=True: inserting an affiliation that is already stored raises an
    # integrity error instead of creating a duplicate row.
    affiliations = CharField(unique=True)
8581
class Subjects(BaseModel):
    """Lookup table holding one row per distinct subject (taxonomy term)."""

    # unique=True: inserting a subject that is already stored raises an
    # integrity error instead of creating a duplicate row.
    subjects = CharField(unique=True)
84+
8685class CorrespondingAuthor (BaseModel ):
8786 corr_author_email = CharField (unique = True )
8887 tld = TextField (null = True )
@@ -105,26 +104,25 @@ class PLOSArticle(BaseModel):
105104 word_count = IntegerField ()
106105 JATS_type = ForeignKeyField (JATSType , related_name = 'jats' )
107106
class SubjectsPLOSArticle(BaseModel):
    """Join table associating a Subjects row with a PLOSArticle row."""

    # Each row links exactly one subject to exactly one article.
    subject = ForeignKeyField(Subjects)
    article = ForeignKeyField(PLOSArticle)
110+
class CoAuthorPLOSArticle(BaseModel):
    """Join table associating a CorrespondingAuthor row with a PLOSArticle row."""

    # Each row links exactly one corresponding author to exactly one article.
    corr_author = ForeignKeyField(CorrespondingAuthor)
    article = ForeignKeyField(PLOSArticle)
111114
# Open the database connection, then create the table for every model this
# script writes to, including the subject lookup and its article join table.
db.connect()
db.create_tables([
    Journal, PLOSArticle, ArticleType, CoAuthorPLOSArticle,
    CorrespondingAuthor, JATSType, Affiliations, Country,
    SubjectsPLOSArticle, Subjects,
])
116119
# Pick the corpus to read: the packaged starter directory when args.starter
# is set, otherwise None is handed to Corpus.
# NOTE(review): None presumably makes Corpus fall back to its default corpus
# directory -- confirm against the Corpus constructor.
corpus_dir = starterdir if args.starter else None
allfiles = Corpus(corpus_dir).files

# When args.random is given, process only that many randomly chosen files;
# otherwise process every file in the corpus.
files = random.sample(allfiles, args.random) if args.random else allfiles
124+
125+ for file_ in tqdm (files ):
128126 doi = filename_to_doi (file_ )
129127 article = Article (doi )
130128 journal_name = journal_title_dict [article .journal .upper ()]
@@ -155,6 +153,24 @@ class CoAuthorPLOSArticle(BaseModel):
155153 created_date = article .pubdate ,
156154 word_count = article .word_count ,
157155 JATS_type = j_type )
# Get subject information: flatten every taxon in the article's taxonomy into
# a set, so each distinct subject is linked to this article exactly once.
taxonomy = article.taxonomy
taxonomy_set = {
    taxon
    for values in taxonomy.values()
    for value in values
    for taxon in value
}
for taxon in taxonomy_set:
    # Subjects.subjects is unique, so the INSERT fails when an earlier
    # article already stored this taxon; in that case reuse the stored row.
    # The atomic block sits *inside* the try so the context manager itself
    # rolls back the failed INSERT; a manual db.rollback() here would also
    # discard any enclosing transaction.
    try:
        with db.atomic():
            subject = Subjects.create(subjects=taxon)
    except (sqlite3.IntegrityError, IntegrityError):
        subject = Subjects.get(Subjects.subjects == taxon)
    # Record the subject <-> article association.
    SubjectsPLOSArticle.create(subject=subject, article=p_art)
158174 if article .authors :
159175 iterable_authors = article .authors
160176 else :
0 commit comments