@@ -173,8 +173,7 @@ def add_data_sets(sets_set):
173173 try :
174174 obj , created = DataSetType .objects .update_or_create (name = dss ['name' ], data_type = dss ['data_type' ], set_type = dss ['set_type' ])
175175
176- print ("[STATUS] Data Set Type created:" )
177- print (obj )
176+ logger .info ("[STATUS] Data Set Type created: {}" .format (obj ))
178177 except Exception as e :
179178 msg = "Data Version {} may not have been added!" .format (dss ['name' ])
180179 ERRORS_SEEN .append (msg )
@@ -219,8 +218,7 @@ def add_programs(program_set):
219218 short_name = prog ['short_name' ], name = prog ['full_name' ], is_public = prog ['public' ],
220219 owner = User .objects .get (email = prog ['owner' ]) if 'owner' in prog else idc_superuser )
221220
222- print ("Program created:" )
223- print (obj )
221+ logger .info ("[STATUS] Program created: {}" .format (obj ))
224222
225223 results [obj .short_name ] = obj
226224
@@ -281,12 +279,14 @@ def add_data_source(name, count_col, source_type, versions, programs, aggregate_
281279 )
282280 copy_attrs ([attr_from ], [name ], attr_exclude )
283281
284- print ("[STATUS] DataSource entry created for: {}" .format (obj .name ))
282+ logger . info ("[STATUS] DataSource entry created for: {}" .format (obj .name ))
285283 except Exception as e :
286284 msg = "DataSource {} may not have been added!" .format (obj .name if obj else 'Unknown' )
287- ERRORS_SEEN .append (msg )
285+ clarifier = "Attributes are copied from a DataSource ORM object matched on the name, NOT from BigQuery directly! Check to make sure you have the correct attribute source name in the ETL config file."
286+ ERRORS_SEEN .append (msg + "\n " + clarifier )
288287 logger .error ("[ERROR] {}" .format (msg ))
289288 logger .exception (e )
289+ logger .error ("[ERROR] {}" .format (clarifier ))
290290
291291
292292def add_source_joins (froms , from_col , tos = None , to_col = None ):
@@ -322,26 +322,27 @@ def add_source_joins(froms, from_col, tos=None, to_col=None):
322322def load_citations (filename ):
323323 try :
324324 cites_file = open (filename ,"r" )
325- current_cites = [x .doi for x in Citation .objects .all ()]
325+ current_cites = [x .doi . lower () for x in Citation .objects .all ()]
326326 new_cites = []
327327 updated_cites = {}
328328 for line in csv_reader (cites_file ):
329- if "doi, citation " in line :
330- print ("[STATUS] Saw header line during citation load - skipping!" )
329+ if "source_doi " in line :
330+ logger . info ("[STATUS] Saw header line during citation load - skipping!" )
331331 continue
332- if line [0 ] in current_cites :
333- updated_cites [line [0 ]] = line [1 ]
332+ if line [0 ]. lower () in current_cites :
333+ updated_cites [line [0 ]. lower () ] = { "doi" : line [0 ], "cite" : line [ 1 ]}
334334 else :
335335 new_cites .append (Citation (doi = line [0 ], cite = line [1 ]))
336336 if len (new_cites ):
337337 Citation .objects .bulk_create (new_cites )
338- print ("[STATUS] The following {} DOI citations were added: {}" .format (len (new_cites ), " " .join ([x .doi for x in new_cites ])))
338+ logger . info ("[STATUS] The following {} DOI citations were added: {}" .format (len (new_cites ), " " .join ([x .doi for x in new_cites ])))
339339 if len (updated_cites ):
340340 to_update = Citation .objects .filter (doi__in = updated_cites .keys ())
341341 for upd in to_update :
342- upd .cite = updated_cites [upd .doi ]
343- Citation .objects .bulk_update (to_update , ["cite" ])
344- print ("[STATUS] {} DOI citations were updated." .format (len (updated_cites )))
342+ upd .cite = updated_cites [upd .doi .lower ()]["cite" ]
343+ upd .doi = updated_cites [upd .doi .lower ()]["doi" ]
344+ Citation .objects .bulk_update (to_update , ["doi" , "cite" ])
345+ logger .info ("[STATUS] {} DOI citations were updated." .format (len (updated_cites )))
345346 except Exception as e :
346347 ERRORS_SEEN .append ("Error seen while loading citations, check the logs!" )
347348 logger .error ("[ERROR] While trying to load citations: " )
@@ -356,11 +357,11 @@ def load_collections(filename, data_version="8.0"):
356357 exact_collection_fields = [
357358 "collection_id" , "collection_uuid" , "name" , "collections" , "image_types" , "supporting_data" , "subject_count" , "doi" ,
358359 "source_url" , "cancer_type" , "species" , "location" , "analysis_artifacts" , "description" , "collection_type" ,
359- "access" , "date_updated" , "active" , "total_size" , "total_size_with_ar" ]
360+ "access" , "date_updated" , "active" ,"total_size" , "total_size_with_ar" ]
360361 field_map = FIELD_MAP
361362 for line in csv_reader (collection_file ):
362363 if COLLECTION_HEADER_CHK in line :
363- print ("[STATUS] Header found - mappping attributes." )
364+ logger . info ("[STATUS] Header found - mappping attributes." )
364365 i = 0
365366 field_map = {}
366367 for field in line :
@@ -371,6 +372,7 @@ def load_collections(filename, data_version="8.0"):
371372 'data' : { x : line [field_map [x ]] for x in exact_collection_fields },
372373 "data_versions" : [{"ver" : data_version , "name" : "TCIA Image Data" }]
373374 }
375+ collex ['data' ]['license' ] = line [field_map ["license_short_name" ]]
374376 collex ['data' ]['nbia_collection_id' ] = line [field_map ['tcia_wiki_collection_id' ]]
375377 collex ['data' ]['tcia_collection_id' ] = line [field_map ['tcia_wiki_collection_id' ]]
376378 collex ['data' ]['active' ] = bool ((line [field_map ['active' ]]).lower () == "true" )
@@ -612,7 +614,7 @@ def copy_attrs(from_data_sources, to_data_sources, attr_excludes):
612614
613615 for fds in from_sources :
614616 from_source_attrs = fds .attribute_set .exclude (id__in = to_sources_attrs ['ids' ]).exclude (name__in = attr_excludes )
615- print ( " Copying {} attributes from {} to: {}." .format (
617+ logger . info ( "[STATUS] Copying {} attributes from {} to: {}." .format (
616618 len (from_source_attrs .values_list ('name' ,flat = True )),
617619 fds .name , "; " .join (to_data_sources ),
618620
@@ -817,20 +819,19 @@ def parse_args():
817819 parser .add_argument ('-s' , '--solr-files-only' , type = str , default = '' , help = solr_msg )
818820 return parser .parse_args ()
819821
820-
821822def main ():
822823
823824 try :
824825 if len (sys .argv ) <= 1 :
825- print ("Use -h to access the help description." )
826+ logger . info ("Use -h to access the help description." )
826827 exit (0 )
827828
828829 args = parse_args ()
829830
830831 # Load the configuration file into ETL_CONFIG and run data version and data source creation
831832 # This will copy over any attributes from prior versions indicated in the JSON config
832833 # Note that the config file is only required for 'full ETL' i.e. creation of new versions and
833- # deprecation of prior ones; it can be omitted to perform piecemeal updates eg. to collections
834+ # deprecation of prior ones and running BQ queries. It can be omitted to perform piecemeal updates, e.g. to
834835 # metadata
835836 len (args .config_file ) and update_data_versions (args .config_file )
836837
@@ -860,7 +861,7 @@ def main():
860861 attr_obj = Attribute .objects .get (name = attr )
861862 update_display_values (attr_obj , dvals [attr ]['vals' ])
862863 except ObjectDoesNotExist as e :
863- print ("[WARNING] Attr {} not found - display values will not be updated! Rerun ETL if this is not expected." .format (attr ))
864+ logger . warning ("[WARNING] Attr {} not found - display values will not be updated! Rerun ETL if this is not expected." .format (attr ))
864865
865866 # Solr commands are automatically output for full ETL; the step below is for outside-of-ETL executions
866867 if len (ETL_CONFIG ):
@@ -886,7 +887,7 @@ def main():
886887 logger .exception (e )
887888 if len (ERRORS_SEEN ):
888889 for err in ERRORS_SEEN :
889- print ("-> {}" .format (err ))
890+ logger . error ("-> {}" .format (err ))
890891
891892
892893if __name__ == "__main__" :
0 commit comments