diff --git a/import-scripts/expand-clinical-data.py b/import-scripts/expand-clinical-data.py index 55a2154e1..2f9d26c68 100644 --- a/import-scripts/expand-clinical-data.py +++ b/import-scripts/expand-clinical-data.py @@ -30,9 +30,9 @@ def expand_clinical_data_main(clinical_filename, fields, impact_data_only, ident continue # update line with supplemental sample clinical data and format data as string for output file line.update(SUPPLEMENTAL_CLINICAL_DATA.get(line[identifier_column_name].strip(), {})) - data = map(lambda v: line.get(v,''), header) + data = map(lambda v: line.get(v,'Unknown'), header) output_data.append('\t'.join(data)) - data_file.close() + data_file.close() # write data to output file output_file = open(clinical_filename, 'w') @@ -49,6 +49,9 @@ def load_supplemental_clinical_data(supplemental_clinical_filename, supplemental line = dict(zip(header, map(str.strip, line.split('\t')))) if study_id == 'genie' and identifier_column_name == 'SAMPLE_ID': normalize_genie_sample_type(line) + add_gene_panel_prefix(line) + elif study_id == 'genie' and identifier_column_name == 'PATIENT_ID': + line = dict((k,('Unknown' if v=='NA' else v)) for k,v in line.items()) SUPPLEMENTAL_CLINICAL_DATA[line[identifier_column_name].strip()] = dict({(k,v) for k,v in line.items() if k in supplemental_fields}) data_file.close() @@ -61,6 +64,13 @@ def normalize_genie_sample_type(data): except KeyError: print >> ERROR_FILE, "No SAMPLE_TYPE column detected, cannot normalize genie sample type" return data + +def add_gene_panel_prefix(data): + try: + data['GENE_PANEL'] = 'MSK-'+data['GENE_PANEL'] + except KeyError: + print >> ERROR_FILE, "No GENE_PANEL column detected, cannot add prefix to genie gene panel" + return data def is_impact_sample_or_patient(case_identifier): """ Determine whether sample id is from IMPACT """ diff --git a/import-scripts/generate-clinical-subset.py b/import-scripts/generate-clinical-subset.py index 7286bdd9e..033b3ee3c 100755 --- a/import-scripts/generate-clinical-subset.py +++ b/import-scripts/generate-clinical-subset.py @@ -121,11 +121,20 @@ def filter_patient_clinical_data(clin_patient_file, study_id): data_file = open(clin_patient_file, 'rU') data_reader = [line for line in data_file.readlines() if not line.startswith('#')][1:] output_data = ['\t'.join(header)] + + SUPPLEMENTAL_PATIENT_CLINICAL_DATA = {} for line in data_reader: data = dict(zip(header, map(str.strip, line.split('\t')))) - if not data['PATIENT_ID'] in FILTERED_PATIENT_IDS: - continue - formatted_data = map(lambda v: data.get(v,''), header) + if data['PATIENT_ID'] in FILTERED_PATIENT_IDS: SUPPLEMENTAL_PATIENT_CLINICAL_DATA[data['PATIENT_ID']] = data + + for patient_id in FILTERED_PATIENT_IDS: + if not patient_id in SUPPLEMENTAL_PATIENT_CLINICAL_DATA: + SUPPLEMENTAL_PATIENT_CLINICAL_DATA[patient_id] = dict(zip(header, map(lambda v: (patient_id if v=='PATIENT_ID' else 'NA'), header))) + + normalized_supplemental_patient_data = normalize_genie_patient_attributes(SUPPLEMENTAL_PATIENT_CLINICAL_DATA) + + for patient in normalized_supplemental_patient_data: + formatted_data = map(lambda v: normalized_supplemental_patient_data[patient].get(v,''), header) output_data.append('\t'.join(formatted_data)) data_file.close() @@ -135,6 +144,23 @@ def filter_patient_clinical_data(clin_patient_file, study_id): output_file.close() print >> OUTPUT_FILE, 'Input patient clinical data filtered by patient id for study: ' + study_id +def normalize_genie_patient_attributes(data): + for patient_id in data: + if 'NAACCR_SEX_CODE' in data[patient_id] and data[patient_id]['NAACCR_SEX_CODE'].strip() == 'NA': + data[patient_id]['NAACCR_SEX_CODE'] = '9' + if 'NAACCR_RACE_CODE_PRIMARY' in data[patient_id] and data[patient_id]['NAACCR_RACE_CODE_PRIMARY'].strip() == 'NA': + data[patient_id]['NAACCR_RACE_CODE_PRIMARY'] = '99' + if 'NAACCR_RACE_CODE_SECONDARY' in data[patient_id] and data[patient_id]['NAACCR_RACE_CODE_SECONDARY'].strip() == 'NA': + data[patient_id]['NAACCR_RACE_CODE_SECONDARY'] = '99' + if 'NAACCR_RACE_CODE_TERTIARY' in data[patient_id] and data[patient_id]['NAACCR_RACE_CODE_TERTIARY'].strip() == 'NA': + data[patient_id]['NAACCR_RACE_CODE_TERTIARY'] = '99' + if 'NAACCR_ETHNICITY_CODE' in data[patient_id] and data[patient_id]['NAACCR_ETHNICITY_CODE'].strip() == 'NA': + data[patient_id]['NAACCR_ETHNICITY_CODE'] = '9' + if 'BIRTH_YEAR' in data[patient_id] and data[patient_id]['BIRTH_YEAR'].strip() == 'NA': + data[patient_id]['BIRTH_YEAR'] = 'Unknown' + return data + + def generate_sample_subset_file(subset_filename): """ Writes subset of sample ids to output directory. """ output_file = open(subset_filename, 'w') diff --git a/import-scripts/merge-cna-records.py b/import-scripts/merge-cna-records.py new file mode 100644 index 000000000..e3a8c2cbe --- /dev/null +++ b/import-scripts/merge-cna-records.py @@ -0,0 +1,100 @@ +import sys +import os +import optparse + +ERROR_FILE = sys.stderr +OUTPUT_FILE = sys.stdout + +HEADER_KEYWORDS = ['hugo_symbol','entrez_gene_id'] +GENE_MERGE_LIST = {'CDKN2Ap16INK4A': 'CDKN2A', +'CDKN2Ap14ARF': 'CDKN2A', +'MLL': 'KMT2A', +'MLL2': 'KMT2D', +'MLL3': 'KMT2C', +'MLL4': 'KMT2B', +'FAM123B': 'AMER1', +'MYCL1': 'MYCL'} + +def get_file_header(filename): + """ Returns the file header. """ + data_file = open(filename, 'rU') + filedata = [x for x in data_file.readlines() if not x.startswith('#')] + header = map(str.strip, filedata[0].split('\t')) + data_file.close() + return header + +def merge_duplicate_cna_records(data): + for gene in data: + if len(data[gene]) > 1: + merge_status = 0 + cna_data = map(lambda v: set(v), zip(*data[gene])) + merged_cna_data = [] + for value in cna_data: + if len(value) > 1: + value = value - set(['NA']) + if len(value) > 1: value = value - set(['']) + if len(value) > 1: value = value - set(['0']) + if len(value) > 1: + if len(value) == 2 and '-1.5' in value and '-2' in value: value.remove('-1.5') + else: merge_status = 1; break + merged_cna_data.append(map(str,value)) + else: + merged_cna_data.append(map(str,value)) + if merge_status == 1: + print >> ERROR_FILE, "The copy number values for gene", gene, "cannot be merged" + else: + merged_cna_data = [value[0] for value in merged_cna_data] + data[gene] = [merged_cna_data] + return(data) + +def write_merged_cna_data(data,header,out_cna_filepath): + unmerged_data = "" + merged_data = "" + + for gene_symbol in data: + if len(data[gene_symbol]) > 1: + for record in data[gene_symbol]: + unmerged_data += gene_symbol+'\t'+'\t'.join(record)+'\n' + else: + merged_data += gene_symbol+'\t'+'\t'.join(data[gene_symbol][0])+'\n' + + if unmerged_data != "": + unmerged_file = open(out_cna_filepath+'data_CNA_unmerged.txt','w') + unmerged_file.write('\t'.join(header)+'\n') + unmerged_file.write(unmerged_data) + print >> OUTPUT_FILE, "The unmerged CNA records are written to :", out_cna_filepath+'data_CNA_unmerged.txt' + if merged_data != "": + merged_file = open(out_cna_filepath+'data_CNA_merged.txt','w') + merged_file.write('\t'.join(header)+'\n') + merged_file.write(merged_data) + print >> OUTPUT_FILE, "The merged CNA records are written to :", out_cna_filepath+'data_CNA_merged.txt' + +def main(): + # get command line arguments + parser = optparse.OptionParser() + parser.add_option('-i', '--input-cnafile', action = 'store', dest = 'cnafile') + parser.add_option('-o', '--output-cna-filepath', action = 'store', dest = 'out_cna_filepath') + + (options, args) = parser.parse_args() + cna_filename = options.cnafile + out_cna_filepath = options.out_cna_filepath + + header = get_file_header(cna_filename) + + # load data from clinical_filename and write data to output directory + data_file = open(cna_filename, 'rU') + data_reader = [line for line in data_file.readlines() if not line.startswith('#')][1:] + + COPY_NUMBER_DATA = {} + for line in data_reader: + line = line.strip('\n').split('\t') + if line[0] in GENE_MERGE_LIST: line[0] = GENE_MERGE_LIST[line[0]] + if line[0] not in COPY_NUMBER_DATA: COPY_NUMBER_DATA[line[0]] = [line[1:]] + else: COPY_NUMBER_DATA[line[0]].append(line[1:]) + + data = merge_duplicate_cna_records(COPY_NUMBER_DATA) + + write_merged_cna_data(data,header,out_cna_filepath) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/import-scripts/remove-duplicate-maf-variants.py b/import-scripts/remove-duplicate-maf-variants.py new file mode 100644 index 000000000..4328ce46d --- /dev/null +++ b/import-scripts/remove-duplicate-maf-variants.py @@ -0,0 +1,84 @@ +import sys +import os +import optparse + +# Script to remove duplicate maf records based on the 8 key columns. +# Calculates VAF for each record and picks the record with high VAF +# Formula for VAF = t_alt_count / (t_ref_count + t_alt_count) + +ERROR_FILE = sys.stderr +OUTPUT_FILE = sys.stdout + +KEY_COLUMNS_INDEX = [] +KEY_COLUMNS = ['Entrez_Gene_Id','Chromosome','Start_Position','End_Position','Variant_Classification','Tumor_Seq_Allele2','Tumor_Sample_Barcode','HGVSp_Short'] +MAF_DATA = {} + +def remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index): + outfile = [] + outfile.append(comments) + outfile.append(header) + for key in MAF_DATA: + if len(MAF_DATA[key]) > 1: + vaf_ind = 0 + vaf_value = 0 + for val in MAF_DATA[key]: + #calculate VAF for each duplicate record. + columns = val.rstrip('\n').split('\t') + try: + VAF = int(columns[t_altc_index])/(int(columns[t_altc_index])+int(columns[t_refc_index])) + if VAF > vaf_value: + vaf_value = VAF + vaf_ind = MAF_DATA[key].index(val) + outfile.append(MAF_DATA[key][vaf_ind]) + except: + print >> ERROR_FILE, 'ERROR: VAF cannot be calculated for the variant : ' + key + print >> ERROR_FILE, 'The t_ref_count is: '+ columns[t_refc_index]+ ' and t_alt_count is: '+ columns[t_altc_index] + outfile.append(val) + else: + outfile.append(MAF_DATA[key][0]) + + datafile = open(maf_filename, 'w') + for line in outfile: + datafile.write(line) + datafile.close() + print >> OUTPUT_FILE, 'MAF file with duplicate variants removed is written to: ' + maf_filename + + +def main(): + # get command line arguments + parser = optparse.OptionParser() + parser.add_option('-i', '--input-maf-file', action = 'store', dest = 'maf_file') + + (options, args) = parser.parse_args() + maf_filename = options.maf_file + + comments = "" + header = "" + + with open(maf_filename,'r') as maf_file: + for line in maf_file: + if line.startswith('#'): + comments += line + elif line.startswith('Hugo_Symbol'): + header += line + header_cols = line.rstrip('\n').split('\t') + #get the positions of the 8 key maf columns + for value in KEY_COLUMNS: + KEY_COLUMNS_INDEX.append(header_cols.index(value)) + t_refc_index = header_cols.index('t_ref_count') + t_altc_index = header_cols.index('t_alt_count') + else: + reference_key = "" + data = line.rstrip('\n').split('\t') + for index in KEY_COLUMNS_INDEX: + reference_key += data[index]+'\t' + reference_key = reference_key.rstrip('\t') + if reference_key not in MAF_DATA: + MAF_DATA[reference_key] = [line] + else: + MAF_DATA[reference_key].append(line) + + remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index) + +if __name__ == '__main__': + main() diff --git a/import-scripts/subset-impact-data.sh b/import-scripts/subset-impact-data.sh index ebe168528..1f5c71934 100755 --- a/import-scripts/subset-impact-data.sh +++ b/import-scripts/subset-impact-data.sh @@ -87,8 +87,7 @@ if [ $STUDY_ID == "genie" ]; then else # starting in Nov 2018 releases, all vital status information will be removed from patient file and placed in a separate file: # get the patients from the filtered set of patients from the generate-clinical-subset.py call and expand file with the vital status columns - cut -f1 $OUTPUT_DIRECTORY/data_clinical_supp_patient.txt > $OUTPUT_DIRECTORY/vital_status.txt - $PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/expand-clinical-data.py --study-id="genie" --clinical-file="$OUTPUT_DIRECTORY/vital_status.txt" --clinical-supp-file="$INPUT_DIRECTORY/ddp/ddp_vital_status.txt" --fields="YEAR_CONTACT,YEAR_DEATH,INT_CONTACT,INT_DOD,DEAD" --identifier-column-name="PATIENT_ID" + $PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/expand-clinical-data.py --study-id="genie" --clinical-file="$OUTPUT_DIRECTORY/data_clinical_supp_patient.txt" --clinical-supp-file="$INPUT_DIRECTORY/ddp/ddp_vital_status.txt" --fields="YEAR_CONTACT,YEAR_DEATH,INT_CONTACT,INT_DOD,DEAD" --identifier-column-name="PATIENT_ID" if [ $? -gt 0 ] ; then echo "Failed to expand $OUTPUT_DIRECTORY/vital_status.txt with YEAR_CONTACT, YEAR_DEATH, INT_CONTACT, INT_DOD, DEAD from $INPUT_DIRECTORY/ddp/ddp_vital_status.txt. Exiting..." exit 2 @@ -106,7 +105,7 @@ if [ $STUDY_ID == "genie" ]; then $PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/add-age-at-seq-report.py --clinical-file="$OUTPUT_DIRECTORY/data_clinical_supp_sample.txt" --seq-date-file="$INPUT_DIRECTORY/cvr/seq_date.txt" --age-file="$INPUT_DIRECTORY/ddp/ddp_age.txt" --convert-to-days="true" if [ $? -gt 0 ] ; then echo "Failed to add AGE_AT_SEQ_REPORT to $OUTPUT_DIRECTORY/data_clinical_supp_sample.txt using $INPUT_DIRECTORY/cvr/seq_date.txt. Exiting..." - exit 2l + exit 2 fi # rename GENE_PANEL to SEQ_ASSAY_ID in data_clinical_supp_sample.txt @@ -123,6 +122,20 @@ if [ $STUDY_ID == "genie" ]; then # remove germline mutations from maf grep -v 'GERMLINE' $OUTPUT_DIRECTORY/data_mutations_extended.txt > $OUTPUT_DIRECTORY/data_mutations_extended.txt.tmp mv $OUTPUT_DIRECTORY/data_mutations_extended.txt.tmp $OUTPUT_DIRECTORY/data_mutations_extended.txt + fi + + # merge duplicate variants from maf + $PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/remove-duplicate-maf-variants.py --input-maf-file="$OUTPUT_DIRECTORY/data_mutations_extended.txt" + if [ $? -gt 0 ] ; then + echo "Failed to merge duplicate CNA records in $OUTPUT_DIRECTORY/data_CNA.txt. Exiting.." + exit 2 + fi + + # merge CNA records for certain gene duplicates + $PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/merge-cna-records.py --input-cnafile="$OUTPUT_DIRECTORY/data_CNA.txt" --output-cna-filepath="$OUTPUT_DIRECTORY/" + if [ $? -gt 0 ] ; then + echo "Failed to merge duplicate CNA records in $OUTPUT_DIRECTORY/data_CNA.txt. Exiting.." + exit 2 fi fi else