Skip to content

Commit fb998bd

Browse files
author
Avery Wang
committed
remove references to cdd
replace cdd with a JSON file with overrides
1 parent ef1d9f4 commit fb998bd

1 file changed

Lines changed: 41 additions & 94 deletions

File tree

import-scripts/add_clinical_attribute_metadata_headers.py

Lines changed: 41 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,31 @@
11
#!/usr/bin/env python
22
# ------------------------------------------------------------------------------
3-
# Utility script which adds metadata headers to specified file(s)
4-
# Metadata is pulled from the clinical data dictionary web service (cdd)
5-
# Four lines added at the top (display name, descriptions, datatype, priority)
6-
# Changes are only made if all input files are valid
3+
# Utility script which adds metadata headers to the specified clinical file(s).
4+
#
5+
# Metadata is loaded exclusively from a JSON file (-i / --independent-metadata-file)
6+
# that maps normalized column header names to their metadata objects. Each entry
7+
# in the JSON file must have the following keys:
8+
# DISPLAY_NAME, DESCRIPTIONS, DATATYPE, ATTRIBUTE_TYPE, PRIORITY
9+
#
10+
# Default behavior for attributes not found in the JSON file:
11+
# DISPLAY_NAME : attribute name with underscores replaced by spaces, title-cased
12+
# DESCRIPTIONS : same as DISPLAY_NAME default
13+
# DATATYPE : STRING
14+
# ATTRIBUTE_TYPE: SAMPLE
15+
# PRIORITY : 1
16+
#
17+
# The script rewrites each input file in-place, prepending four metadata header
18+
# lines (display name, description, datatype, priority). A fifth attribute_type
19+
# header line is included only when the file is not a standard patient or sample
20+
# clinical file (i.e., not data_clinical_patient.txt or data_clinical_sample.txt).
21+
#
22+
# Changes are only made if all input files are valid (exist and are writable).
723
# ------------------------------------------------------------------------------
824

925
from clinicalfile_utils import write_data, write_header_line, get_header
1026
import argparse
1127
import json
1228
import os
13-
import requests
1429
import shutil
1530
import sys
1631
import tempfile
@@ -24,35 +39,24 @@
2439
COLUMN_HEADER_KEY = 'column_header'
2540
ATTRIBUTE_TYPE_KEY = 'attribute_type'
2641
PRIORITY_KEY = 'priority'
27-
OVERRIDDEN_STUDY_NAME_KEY = 'name'
28-
DEFAULT_URL = "https://cdd.cbioportal.mskcc.org/api/"
2942

3043
PATIENT_CLINICAL_FILE_PATTERN = "data_clinical_patient.txt"
3144
SAMPLE_CLINICAL_FILE_PATTERN = "data_clinical_sample.txt"
3245

33-
def check_valid_studyid(study_id, base_cdd_url):
34-
query = base_cdd_url + "cancerStudies"
35-
response = requests.get(query)
36-
response_as_json = json.loads(response.text)
37-
if study_id not in [overridden_study[OVERRIDDEN_STUDY_NAME_KEY] for overridden_study in response_as_json]:
38-
print >> ERROR_FILE, 'Invalid study id: ' + study_id + ", there is no associated attributes in CDD"
39-
sys.exit(2)
40-
41-
def response_is_200(response):
42-
if response.status_code != 200:
43-
return False
44-
return True
46+
def get_metadata_dictionary(all_attributes, metadata_file):
47+
"""Load metadata for the given attributes from the JSON metadata file.
4548
46-
def get_independently_determined_attributes_metadata_dictionary(all_attributes, independent_metadata_file):
49+
Returns a dict mapping normalized column header -> metadata object for
50+
every attribute that appears in the metadata file. Attributes absent from the file
51+
are omitted and will receive default values at write time.
52+
"""
4753
metadata_dictionary = {}
48-
if not independent_metadata_file:
49-
return metadata_dictionary
50-
f = open(independent_metadata_file, "r")
51-
independently_determined_attributes = json.load(f)
54+
f = open(metadata_file, "r")
55+
all_metadata = json.load(f)
5256
f.close()
5357
for normalized_column_header in all_attributes:
54-
if normalized_column_header in independently_determined_attributes:
55-
metadata = independently_determined_attributes[normalized_column_header]
58+
if normalized_column_header in all_metadata:
59+
metadata = all_metadata[normalized_column_header]
5660
metadata_dictionary[normalized_column_header] = {
5761
'DISPLAY_NAME' : metadata['DISPLAY_NAME'],
5862
'DESCRIPTIONS' : metadata['DESCRIPTIONS'],
@@ -62,49 +66,6 @@ def get_independently_determined_attributes_metadata_dictionary(all_attributes,
6266
}
6367
return metadata_dictionary
6468

65-
def add_clinical_attribute_metadata_from_cdd(study_id, header, base_cdd_url, metadata_mapping):
66-
response = requests.post(base_cdd_url + "?cancerStudy=" + study_id if study_id else base_cdd_url, json=list(header))
67-
# ROB : we can probably delete the following block, unless we want to try to succeed even when our independently determined metadata is missing a needed attribute
68-
# ROB : or change this to a graceful exit reporting the missing headers
69-
# ROB VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
70-
if not response_is_200(response):
71-
for attr in header:
72-
x = [attr]
73-
response = requests.post(base_cdd_url + "?cancerStudy=" + study_id if study_id else base_cdd_url, json=list(x))
74-
if not response_is_200(response):
75-
continue
76-
response_as_json = json.loads(response.text)
77-
for entry in response_as_json:
78-
normalized_column_header = entry[COLUMN_HEADER_KEY]
79-
display_name = entry[DISPLAY_NAME_KEY]
80-
description = entry[DESCRIPTION_KEY]
81-
datatype = entry[DATATYPE_KEY]
82-
attribute_type = entry[ATTRIBUTE_TYPE_KEY]
83-
priority = entry[PRIORITY_KEY]
84-
metadata_mapping[normalized_column_header] = {
85-
'DISPLAY_NAME' : display_name,
86-
'DESCRIPTIONS' : description,
87-
'DATATYPE' : datatype,
88-
'ATTRIBUTE_TYPE' : attribute_type,
89-
'PRIORITY' : priority}
90-
print metadata_mapping
91-
return metadata_mapping
92-
# ROB ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
93-
response_as_json = json.loads(response.text)
94-
for entry in response_as_json:
95-
normalized_column_header = entry[COLUMN_HEADER_KEY]
96-
display_name = entry[DISPLAY_NAME_KEY]
97-
description = entry[DESCRIPTION_KEY]
98-
datatype = entry[DATATYPE_KEY]
99-
attribute_type = entry[ATTRIBUTE_TYPE_KEY]
100-
priority = entry[PRIORITY_KEY]
101-
metadata_mapping[normalized_column_header] = {
102-
'DISPLAY_NAME' : display_name,
103-
'DESCRIPTIONS' : description,
104-
'DATATYPE' : datatype,
105-
'ATTRIBUTE_TYPE' : attribute_type,
106-
'PRIORITY' : priority}
107-
10869
def write_headers(header, metadata_dictionary, output_file, is_mixed_attribute_types_format):
10970
name_line = []
11071
description_line = []
@@ -119,7 +80,7 @@ def write_headers(header, metadata_dictionary, output_file, is_mixed_attribute_t
11980
attribute_type_line.append(metadata_dictionary[attribute]['ATTRIBUTE_TYPE'])
12081
priority_line.append(metadata_dictionary[attribute]['PRIORITY'])
12182
else:
122-
# if attribute is not defined in cdd, use defaults
83+
# attribute not found in metadata file -- apply defaults
12384
name_line.append(attribute.replace("_", " ").title())
12485
description_line.append(attribute.replace("_", " ").title())
12586
datatype_line.append('STRING')
@@ -143,19 +104,13 @@ def check_if_mixed_attribute_types_format(filename):
143104
def main():
144105
parser = argparse.ArgumentParser()
145106
parser.add_argument("-f", "--files", nargs = "+", help = "file(s) to add metadata headers", required = True)
146-
parser.add_argument("-s", "--study-id", help = "study id for specific overrides", required = False)
147-
parser.add_argument("-c", "--cdd-url", help = "the url for the cdd web application, default is https://cdd.cbioportal.mskcc.org/api/", required = False)
148-
parser.add_argument("-i", "--independent-metadata-file", help = "a file containing a json map from normalized_header to metadata object", required = False)
107+
parser.add_argument("-i", "--independent-metadata-file", help = "a JSON file containing a map from normalized column header to metadata object", required = True)
149108
args = parser.parse_args()
150109
clinical_files = args.files
151-
study_id = args.study_id
152-
cdd_url = args.cdd_url
153-
independent_metadata_file = args.independent_metadata_file
154-
# change base url if specified (i.e for testing)
155-
if cdd_url:
156-
base_cdd_url = cdd_url
157-
else:
158-
base_cdd_url = DEFAULT_URL
110+
metadata_file = args.independent_metadata_file
111+
if not os.path.exists(metadata_file):
112+
print >> ERROR_FILE, 'Metadata file not found: ' + metadata_file
113+
sys.exit(2)
159114
# check file (args) validity and return error if any file fails check
160115
missing_clinical_files = [clinical_file for clinical_file in clinical_files if not os.path.exists(clinical_file)]
161116
if len(missing_clinical_files) > 0:
@@ -165,23 +120,15 @@ def main():
165120
if len(not_writable_clinical_files) > 0:
166121
print >> ERROR_FILE, 'File(s) not writable: ' + ', '.join(not_writable_clinical_files)
167122
sys.exit(2)
168-
if (study_id):
169-
check_valid_studyid(study_id, base_cdd_url)
170123
all_attributes = set()
171124
# get a set of attributes used across all input files
172125
for clinical_file in clinical_files:
173126
all_attributes = all_attributes.union(get_header(clinical_file))
174-
# set metadata for independently determined attributes which are members of all_attributes
175-
metadata_dictionary = get_independently_determined_attributes_metadata_dictionary(all_attributes, independent_metadata_file)
176-
# get a set of "to be determined by ddp" attributes
177-
ddp_dependent_attributes = set()
178-
for attribute in all_attributes:
179-
if not attribute in metadata_dictionary:
180-
ddp_dependent_attributes.add(attribute)
181-
add_clinical_attribute_metadata_from_cdd(study_id, ddp_dependent_attributes, base_cdd_url, metadata_dictionary)
182-
# check metadata is defined for all attributes in CDD
183-
if len(metadata_dictionary.keys()) != len(all_attributes):
184-
print >> ERROR_FILE, 'Error, metadata not found for attribute(s): ' + ', '.join(all_attributes.difference(metadata_dictionary.keys()))
127+
# load metadata for all attributes from the JSON file; missing attributes get defaults at write time
128+
metadata_dictionary = get_metadata_dictionary(all_attributes, metadata_file)
129+
missing_attributes = all_attributes.difference(metadata_dictionary.keys())
130+
if missing_attributes:
131+
print >> ERROR_FILE, 'Warning: metadata not found for attribute(s), defaults will be used: ' + ', '.join(missing_attributes)
185132
for clinical_file in clinical_files:
186133
# create temp file to write to
187134
temp_file, temp_file_name = tempfile.mkstemp()

0 commit comments

Comments
 (0)