Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 40 additions & 94 deletions import-scripts/add_clinical_attribute_metadata_headers.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,30 @@
#!/usr/bin/env python
# ------------------------------------------------------------------------------
# Utility script which adds metadata headers to specified file(s)
# Metadata is pulled from the clinical data dictionary web service (cdd)
# Four lines added at the top (display name, dscriptions, datatype, priority)
# Changes are only made if all input files are valid
# Utility script which adds metadata headers to the specified clinical file(s).
#
# Metadata is loaded exclusively from a JSON file (-i / --independent-metadata-file)
# that maps normalized column header names to their metadata objects. Each entry
# in the JSON file must have the following keys:
# DISPLAY_NAME, DESCRIPTIONS, DATATYPE, ATTRIBUTE_TYPE, PRIORITY
#
# Default behavior for attributes not found in the JSON file:
# DISPLAY_NAME : attribute name with underscores replaced by spaces, title-cased
# DESCRIPTIONS : same as DISPLAY_NAME default
# DATATYPE : STRING
# ATTRIBUTE_TYPE: SAMPLE
# PRIORITY : 1
#
# The script adds four/five metadata headers lines (display name, description, datatype, priority, attribute type).
# The fifth line (attribute type) is added when the file is not split between patient/sample, based on the
# clinical file name (i.e., not data_clinical_patient.txt or data_clinical_sample.txt).
#
# Changes are only made if all input files are valid (exist and are writable).
# ------------------------------------------------------------------------------

from clinicalfile_utils import write_data, write_header_line, get_header
import argparse
import json
import os
import requests
import shutil
import sys
import tempfile
Expand All @@ -24,35 +38,24 @@
COLUMN_HEADER_KEY = 'column_header'
ATTRIBUTE_TYPE_KEY = 'attribute_type'
PRIORITY_KEY = 'priority'
OVERRIDDEN_STUDY_NAME_KEY = 'name'
DEFAULT_URL = "https://cdd.cbioportal.mskcc.org/api/"

PATIENT_CLINICAL_FILE_PATTERN = "data_clinical_patient.txt"
SAMPLE_CLINICAL_FILE_PATTERN = "data_clinical_sample.txt"

def check_valid_studyid(study_id, base_cdd_url):
query = base_cdd_url + "cancerStudies"
response = requests.get(query)
response_as_json = json.loads(response.text)
if study_id not in [overridden_study[OVERRIDDEN_STUDY_NAME_KEY] for overridden_study in response_as_json]:
print >> ERROR_FILE, 'Invalid study id: ' + study_id + ", there is no associated attributes in CDD"
sys.exit(2)

def response_is_200(response):
if response.status_code != 200:
return False
return True
def get_metadata_dictionary(all_attributes, metadata_file):
"""Load metadata for the given attributes from the JSON metadata file.

def get_independently_determined_attributes_metadata_dictionary(all_attributes, independent_metadata_file):
Returns a dict mapping normalized column header -> metadata object for
every attribute that appears in the file. Attributes absent from the file
are omitted and will receive default values at write time.
"""
metadata_dictionary = {}
if not independent_metadata_file:
return metadata_dictionary
f = open(independent_metadata_file, "r")
independently_determined_attributes = json.load(f)
f = open(metadata_file, "r")
all_metadata = json.load(f)
f.close()
for normalized_column_header in all_attributes:
if normalized_column_header in independently_determined_attributes:
metadata = independently_determined_attributes[normalized_column_header]
if normalized_column_header in all_metadata:
metadata = all_metadata[normalized_column_header]
metadata_dictionary[normalized_column_header] = {
'DISPLAY_NAME' : metadata['DISPLAY_NAME'],
'DESCRIPTIONS' : metadata['DESCRIPTIONS'],
Expand All @@ -62,49 +65,6 @@ def get_independently_determined_attributes_metadata_dictionary(all_attributes,
}
return metadata_dictionary

def add_clinical_attribute_metadata_from_cdd(study_id, header, base_cdd_url, metadata_mapping):
response = requests.post(base_cdd_url + "?cancerStudy=" + study_id if study_id else base_cdd_url, json=list(header))
# ROB : we can probably delete the following block, unless we want to try to succed even when our independently determined metadata is missing a needed attribute
# ROB : or change this to a graceful exit reporting the missing headers
# ROB VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
if not response_is_200(response):
for attr in header:
x = [attr]
response = requests.post(base_cdd_url + "?cancerStudy=" + study_id if study_id else base_cdd_url, json=list(x))
if not response_is_200(response):
continue
response_as_json = json.loads(response.text)
for entry in response_as_json:
normalized_column_header = entry[COLUMN_HEADER_KEY]
display_name = entry[DISPLAY_NAME_KEY]
description = entry[DESCRIPTION_KEY]
datatype = entry[DATATYPE_KEY]
attribute_type = entry[ATTRIBUTE_TYPE_KEY]
priority = entry[PRIORITY_KEY]
metadata_mapping[normalized_column_header] = {
'DISPLAY_NAME' : display_name,
'DESCRIPTIONS' : description,
'DATATYPE' : datatype,
'ATTRIBUTE_TYPE' : attribute_type,
'PRIORITY' : priority}
print metadata_mapping
return metadata_mapping
# ROB ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
response_as_json = json.loads(response.text)
for entry in response_as_json:
normalized_column_header = entry[COLUMN_HEADER_KEY]
display_name = entry[DISPLAY_NAME_KEY]
description = entry[DESCRIPTION_KEY]
datatype = entry[DATATYPE_KEY]
attribute_type = entry[ATTRIBUTE_TYPE_KEY]
priority = entry[PRIORITY_KEY]
metadata_mapping[normalized_column_header] = {
'DISPLAY_NAME' : display_name,
'DESCRIPTIONS' : description,
'DATATYPE' : datatype,
'ATTRIBUTE_TYPE' : attribute_type,
'PRIORITY' : priority}

def write_headers(header, metadata_dictionary, output_file, is_mixed_attribute_types_format):
name_line = []
description_line = []
Expand All @@ -119,7 +79,7 @@ def write_headers(header, metadata_dictionary, output_file, is_mixed_attribute_t
attribute_type_line.append(metadata_dictionary[attribute]['ATTRIBUTE_TYPE'])
priority_line.append(metadata_dictionary[attribute]['PRIORITY'])
else:
# if attribute is not defined in cdd, use defaults
# attribute not found in metadata file -- apply defaults
name_line.append(attribute.replace("_", " ").title())
description_line.append(attribute.replace("_", " ").title())
datatype_line.append('STRING')
Expand All @@ -143,19 +103,13 @@ def check_if_mixed_attribute_types_format(filename):
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--files", nargs = "+", help = "file(s) to add metadata headers", required = True)
parser.add_argument("-s", "--study-id", help = "study id for specific overrides", required = False)
parser.add_argument("-c", "--cdd-url", help = "the url for the cdd web application, default is https://cdd.cbioportal.mskcc.org/api/", required = False)
parser.add_argument("-i", "--independent-metadata-file", help = "a file containing a json map from normalized_header to metadata object", required = False)
parser.add_argument("-i", "--independent-metadata-file", help = "a JSON file containing a map from normalized column header to metadata object", required = True)
args = parser.parse_args()
clinical_files = args.files
study_id = args.study_id
cdd_url = args.cdd_url
independent_metadata_file = args.independent_metadata_file
# change base url if specified (i.e for testing)
if cdd_url:
base_cdd_url = cdd_url
else:
base_cdd_url = DEFAULT_URL
metadata_file = args.independent_metadata_file
if not os.path.exists(metadata_file):
print >> ERROR_FILE, 'Metadata file not found: ' + metadata_file
sys.exit(2)
# check file (args) validity and return error if any file fails check
missing_clinical_files = [clinical_file for clinical_file in clinical_files if not os.path.exists(clinical_file)]
if len(missing_clinical_files) > 0:
Expand All @@ -165,23 +119,15 @@ def main():
if len(not_writable_clinical_files) > 0:
print >> ERROR_FILE, 'File(s) not writable: ' + ', '.join(not_writable_clinical_files)
sys.exit(2)
if (study_id):
check_valid_studyid(study_id, base_cdd_url)
all_attributes = set()
# get a set of attributes used across all input files
for clinical_file in clinical_files:
all_attributes = all_attributes.union(get_header(clinical_file))
# set metadata for independently determined attributes which are members of all_attributes
metadata_dictionary = get_independently_determined_attributes_metadata_dictionary(all_attributes, independent_metadata_file)
# get a set of "to be determined by ddp" attributes
ddp_dependent_attributes = set()
for attribute in all_attributes:
if not attribute in metadata_dictionary:
ddp_dependent_attributes.add(attribute)
add_clinical_attribute_metadata_from_cdd(study_id, ddp_dependent_attributes, base_cdd_url, metadata_dictionary)
# check metadata is defined for all attributes in CDD
if len(metadata_dictionary.keys()) != len(all_attributes):
print >> ERROR_FILE, 'Error, metadata not found for attribute(s): ' + ', '.join(all_attributes.difference(metadata_dictionary.keys()))
# load metadata for all attributes from the JSON file; missing attributes get defaults at write time
metadata_dictionary = get_metadata_dictionary(all_attributes, metadata_file)
missing_attributes = all_attributes.difference(metadata_dictionary.keys())
if missing_attributes:
print >> ERROR_FILE, 'Warning: metadata not found for attribute(s), defaults will be used: ' + ', '.join(missing_attributes)
for clinical_file in clinical_files:
# create temp file to write to
temp_file, temp_file_name = tempfile.mkstemp()
Expand Down