11#!/usr/bin/env python
22# ------------------------------------------------------------------------------
3- # Utility script which adds metadata headers to specified file(s)
4- # Metadata is pulled from the clinical data dictionary web service (cdd)
5- # Four lines added at the top (display name, descriptions, datatype, priority)
6- # Changes are only made if all input files are valid
3+ # Utility script which adds metadata headers to the specified clinical file(s).
4+ #
5+ # Metadata is loaded exclusively from a JSON file (-i / --independent-metadata-file)
6+ # that maps normalized column header names to their metadata objects. Each entry
7+ # in the JSON file must have the following keys:
8+ # DISPLAY_NAME, DESCRIPTIONS, DATATYPE, ATTRIBUTE_TYPE, PRIORITY
9+ #
10+ # Default behavior for attributes not found in the JSON file:
11+ # DISPLAY_NAME : attribute name with underscores replaced by spaces, title-cased
12+ # DESCRIPTIONS : same as DISPLAY_NAME default
13+ # DATATYPE : STRING
14+ # ATTRIBUTE_TYPE: SAMPLE
15+ # PRIORITY : 1
16+ #
17+ # The script rewrites each input file in-place, prepending four metadata header
18+ # lines (display name, description, datatype, priority). A fifth attribute_type
19+ # header line is included only when the file is not a standard patient or sample
20+ # clinical file (i.e., not data_clinical_patient.txt or data_clinical_sample.txt).
21+ #
22+ # Changes are only made if all input files are valid (exist and are writable).
723# ------------------------------------------------------------------------------
824
925from clinicalfile_utils import write_data , write_header_line , get_header
1026import argparse
1127import json
1228import os
13- import requests
1429import shutil
1530import sys
1631import tempfile
2439COLUMN_HEADER_KEY = 'column_header'
2540ATTRIBUTE_TYPE_KEY = 'attribute_type'
2641PRIORITY_KEY = 'priority'
27- OVERRIDDEN_STUDY_NAME_KEY = 'name'
28- DEFAULT_URL = "https://cdd.cbioportal.mskcc.org/api/"
2942
3043PATIENT_CLINICAL_FILE_PATTERN = "data_clinical_patient.txt"
3144SAMPLE_CLINICAL_FILE_PATTERN = "data_clinical_sample.txt"
3245
33- def check_valid_studyid (study_id , base_cdd_url ):
34- query = base_cdd_url + "cancerStudies"
35- response = requests .get (query )
36- response_as_json = json .loads (response .text )
37- if study_id not in [overridden_study [OVERRIDDEN_STUDY_NAME_KEY ] for overridden_study in response_as_json ]:
38- print >> ERROR_FILE , 'Invalid study id: ' + study_id + ", there is no associated attributes in CDD"
39- sys .exit (2 )
40-
41- def response_is_200 (response ):
42- if response .status_code != 200 :
43- return False
44- return True
46+ def get_metadata_dictionary (all_attributes , metadata_file ):
47+ """Load metadata for the given attributes from the JSON metadata file.
4548
46- def get_independently_determined_attributes_metadata_dictionary (all_attributes , independent_metadata_file ):
49+ Returns a dict mapping normalized column header -> metadata object for
50+ every attribute that appears in the file. Attributes absent from the file
51+ are omitted and will receive default values at write time.
52+ """
4753 metadata_dictionary = {}
48- if not independent_metadata_file :
49- return metadata_dictionary
50- f = open (independent_metadata_file , "r" )
51- independently_determined_attributes = json .load (f )
54+ f = open (metadata_file , "r" )
55+ all_metadata = json .load (f )
5256 f .close ()
5357 for normalized_column_header in all_attributes :
54- if normalized_column_header in independently_determined_attributes :
55- metadata = independently_determined_attributes [normalized_column_header ]
58+ if normalized_column_header in all_metadata :
59+ metadata = all_metadata [normalized_column_header ]
5660 metadata_dictionary [normalized_column_header ] = {
5761 'DISPLAY_NAME' : metadata ['DISPLAY_NAME' ],
5862 'DESCRIPTIONS' : metadata ['DESCRIPTIONS' ],
@@ -62,49 +66,6 @@ def get_independently_determined_attributes_metadata_dictionary(all_attributes,
6266 }
6367 return metadata_dictionary
6468
65- def add_clinical_attribute_metadata_from_cdd (study_id , header , base_cdd_url , metadata_mapping ):
66- response = requests .post (base_cdd_url + "?cancerStudy=" + study_id if study_id else base_cdd_url , json = list (header ))
67- # ROB : we can probably delete the following block, unless we want to try to succeed even when our independently determined metadata is missing a needed attribute
68- # ROB : or change this to a graceful exit reporting the missing headers
69- # ROB VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
70- if not response_is_200 (response ):
71- for attr in header :
72- x = [attr ]
73- response = requests .post (base_cdd_url + "?cancerStudy=" + study_id if study_id else base_cdd_url , json = list (x ))
74- if not response_is_200 (response ):
75- continue
76- response_as_json = json .loads (response .text )
77- for entry in response_as_json :
78- normalized_column_header = entry [COLUMN_HEADER_KEY ]
79- display_name = entry [DISPLAY_NAME_KEY ]
80- description = entry [DESCRIPTION_KEY ]
81- datatype = entry [DATATYPE_KEY ]
82- attribute_type = entry [ATTRIBUTE_TYPE_KEY ]
83- priority = entry [PRIORITY_KEY ]
84- metadata_mapping [normalized_column_header ] = {
85- 'DISPLAY_NAME' : display_name ,
86- 'DESCRIPTIONS' : description ,
87- 'DATATYPE' : datatype ,
88- 'ATTRIBUTE_TYPE' : attribute_type ,
89- 'PRIORITY' : priority }
90- print metadata_mapping
91- return metadata_mapping
92- # ROB ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
93- response_as_json = json .loads (response .text )
94- for entry in response_as_json :
95- normalized_column_header = entry [COLUMN_HEADER_KEY ]
96- display_name = entry [DISPLAY_NAME_KEY ]
97- description = entry [DESCRIPTION_KEY ]
98- datatype = entry [DATATYPE_KEY ]
99- attribute_type = entry [ATTRIBUTE_TYPE_KEY ]
100- priority = entry [PRIORITY_KEY ]
101- metadata_mapping [normalized_column_header ] = {
102- 'DISPLAY_NAME' : display_name ,
103- 'DESCRIPTIONS' : description ,
104- 'DATATYPE' : datatype ,
105- 'ATTRIBUTE_TYPE' : attribute_type ,
106- 'PRIORITY' : priority }
107-
10869def write_headers (header , metadata_dictionary , output_file , is_mixed_attribute_types_format ):
10970 name_line = []
11071 description_line = []
@@ -119,7 +80,7 @@ def write_headers(header, metadata_dictionary, output_file, is_mixed_attribute_t
11980 attribute_type_line .append (metadata_dictionary [attribute ]['ATTRIBUTE_TYPE' ])
12081 priority_line .append (metadata_dictionary [attribute ]['PRIORITY' ])
12182 else :
122- # if attribute is not defined in cdd, use defaults
83+ # attribute not found in metadata file -- apply defaults
12384 name_line .append (attribute .replace ("_" , " " ).title ())
12485 description_line .append (attribute .replace ("_" , " " ).title ())
12586 datatype_line .append ('STRING' )
@@ -143,19 +104,13 @@ def check_if_mixed_attribute_types_format(filename):
143104def main ():
144105 parser = argparse .ArgumentParser ()
145106 parser .add_argument ("-f" , "--files" , nargs = "+" , help = "file(s) to add metadata headers" , required = True )
146- parser .add_argument ("-s" , "--study-id" , help = "study id for specific overrides" , required = False )
147- parser .add_argument ("-c" , "--cdd-url" , help = "the url for the cdd web application, default is https://cdd.cbioportal.mskcc.org/api/" , required = False )
148- parser .add_argument ("-i" , "--independent-metadata-file" , help = "a file containing a json map from normalized_header to metadata object" , required = False )
107+ parser .add_argument ("-i" , "--independent-metadata-file" , help = "a JSON file containing a map from normalized column header to metadata object" , required = True )
149108 args = parser .parse_args ()
150109 clinical_files = args .files
151- study_id = args .study_id
152- cdd_url = args .cdd_url
153- independent_metadata_file = args .independent_metadata_file
154- # change base url if specified (i.e for testing)
155- if cdd_url :
156- base_cdd_url = cdd_url
157- else :
158- base_cdd_url = DEFAULT_URL
110+ metadata_file = args .independent_metadata_file
111+ if not os .path .exists (metadata_file ):
112+ print >> ERROR_FILE , 'Metadata file not found: ' + metadata_file
113+ sys .exit (2 )
159114 # check file (args) validity and return error if any file fails check
160115 missing_clinical_files = [clinical_file for clinical_file in clinical_files if not os .path .exists (clinical_file )]
161116 if len (missing_clinical_files ) > 0 :
@@ -165,23 +120,15 @@ def main():
165120 if len (not_writable_clinical_files ) > 0 :
166121 print >> ERROR_FILE , 'File(s) not writable: ' + ', ' .join (not_writable_clinical_files )
167122 sys .exit (2 )
168- if (study_id ):
169- check_valid_studyid (study_id , base_cdd_url )
170123 all_attributes = set ()
171124 # get a set of attributes used across all input files
172125 for clinical_file in clinical_files :
173126 all_attributes = all_attributes .union (get_header (clinical_file ))
174- # set metadata for independently determined attributes which are members of all_attributes
175- metadata_dictionary = get_independently_determined_attributes_metadata_dictionary (all_attributes , independent_metadata_file )
176- # get a set of "to be determined by ddp" attributes
177- ddp_dependent_attributes = set ()
178- for attribute in all_attributes :
179- if not attribute in metadata_dictionary :
180- ddp_dependent_attributes .add (attribute )
181- add_clinical_attribute_metadata_from_cdd (study_id , ddp_dependent_attributes , base_cdd_url , metadata_dictionary )
182- # check metadata is defined for all attributes in CDD
183- if len (metadata_dictionary .keys ()) != len (all_attributes ):
184- print >> ERROR_FILE , 'Error, metadata not found for attribute(s): ' + ', ' .join (all_attributes .difference (metadata_dictionary .keys ()))
127+ # load metadata for all attributes from the JSON file; missing attributes get defaults at write time
128+ metadata_dictionary = get_metadata_dictionary (all_attributes , metadata_file )
129+ missing_attributes = all_attributes .difference (metadata_dictionary .keys ())
130+ if missing_attributes :
131+ print >> ERROR_FILE , 'Warning: metadata not found for attribute(s), defaults will be used: ' + ', ' .join (missing_attributes )
185132 for clinical_file in clinical_files :
186133 # create temp file to write to
187134 temp_file , temp_file_name = tempfile .mkstemp ()
0 commit comments