Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions import-scripts/expand-clinical-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def expand_clinical_data_main(clinical_filename, fields, impact_data_only, ident
continue
# update line with supplemental sample clinical data and format data as string for output file
line.update(SUPPLEMENTAL_CLINICAL_DATA.get(line[identifier_column_name].strip(), {}))
data = map(lambda v: line.get(v,''), header)
data = map(lambda v: line.get(v,'Unknown'), header)
output_data.append('\t'.join(data))
data_file.close()
data_file.close()

# write data to output file
output_file = open(clinical_filename, 'w')
Expand All @@ -49,6 +49,9 @@ def load_supplemental_clinical_data(supplemental_clinical_filename, supplemental
line = dict(zip(header, map(str.strip, line.split('\t'))))
if study_id == 'genie' and identifier_column_name == 'SAMPLE_ID':
normalize_genie_sample_type(line)
add_gene_panel_prefix(line)
elif study_id == 'genie' and identifier_column_name == 'PATIENT_ID':
line = dict((k,('Unknown' if v=='NA' else v)) for k,v in line.items())
SUPPLEMENTAL_CLINICAL_DATA[line[identifier_column_name].strip()] = dict({(k,v) for k,v in line.items() if k in supplemental_fields})
data_file.close()

Expand All @@ -61,6 +64,13 @@ def normalize_genie_sample_type(data):
except KeyError:
print >> ERROR_FILE, "No SAMPLE_TYPE column detected, cannot normalize genie sample type"
return data

def add_gene_panel_prefix(data):
    """ Prepend 'MSK-' to the GENE_PANEL value of a record (modified in place) and return the record.

    When the record has no GENE_PANEL key an error is reported on ERROR_FILE
    and the record is returned untouched.
    """
    if 'GENE_PANEL' not in data:
        print >> ERROR_FILE, "No GENE_PANEL column detected, cannot add prefix to genie gene panel"
        return data
    data['GENE_PANEL'] = 'MSK-' + data['GENE_PANEL']
    return data

def is_impact_sample_or_patient(case_identifier):
""" Determine whether sample id is from IMPACT """
Expand Down
32 changes: 29 additions & 3 deletions import-scripts/generate-clinical-subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,20 @@ def filter_patient_clinical_data(clin_patient_file, study_id):
data_file = open(clin_patient_file, 'rU')
data_reader = [line for line in data_file.readlines() if not line.startswith('#')][1:]
output_data = ['\t'.join(header)]

SUPPLEMENTAL_PATIENT_CLINICAL_DATA = {}
for line in data_reader:
data = dict(zip(header, map(str.strip, line.split('\t'))))
if not data['PATIENT_ID'] in FILTERED_PATIENT_IDS:
continue
formatted_data = map(lambda v: data.get(v,''), header)
if data['PATIENT_ID'] in FILTERED_PATIENT_IDS: SUPPLEMENTAL_PATIENT_CLINICAL_DATA[data['PATIENT_ID']] = data

for patient_id in FILTERED_PATIENT_IDS:
if not patient_id in SUPPLEMENTAL_PATIENT_CLINICAL_DATA:
SUPPLEMENTAL_PATIENT_CLINICAL_DATA[patient_id] = dict(zip(header, map(lambda v: (patient_id if v=='PATIENT_ID' else 'NA'), header)))

normalized_supplemental_patient_data = normalize_genie_patient_attributes(SUPPLEMENTAL_PATIENT_CLINICAL_DATA)

for patient in normalized_supplemental_patient_data:
formatted_data = map(lambda v: normalized_supplemental_patient_data[patient].get(v,''), header)
output_data.append('\t'.join(formatted_data))
data_file.close()

Expand All @@ -135,6 +144,23 @@ def filter_patient_clinical_data(clin_patient_file, study_id):
output_file.close()
print >> OUTPUT_FILE, 'Input patient clinical data filtered by patient id for study: ' + study_id

def normalize_genie_patient_attributes(data):
    """ Replace 'NA' placeholders in selected patient attributes, in place.

    data maps a patient id to that patient's attribute dict. For each of the
    attributes listed below, a value that equals 'NA' once surrounding
    whitespace is stripped is overwritten with the paired replacement;
    attributes a patient does not have are left absent. Returns data.
    """
    na_replacements = (
        ('NAACCR_SEX_CODE', '9'),
        ('NAACCR_RACE_CODE_PRIMARY', '99'),
        ('NAACCR_RACE_CODE_SECONDARY', '99'),
        ('NAACCR_RACE_CODE_TERTIARY', '99'),
        ('NAACCR_ETHNICITY_CODE', '9'),
        ('BIRTH_YEAR', 'Unknown'),
    )
    for attributes in data.values():
        for column, replacement in na_replacements:
            # a missing column falls back to '' which never equals 'NA'
            if attributes.get(column, '').strip() == 'NA':
                attributes[column] = replacement
    return data


def generate_sample_subset_file(subset_filename):
""" Writes subset of sample ids to output directory. """
output_file = open(subset_filename, 'w')
Expand Down
100 changes: 100 additions & 0 deletions import-scripts/merge-cna-records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import sys
import os
import optparse

# streams used for error and status messages
ERROR_FILE = sys.stderr
OUTPUT_FILE = sys.stdout

# NOTE(review): not referenced by any function visible in this script - confirm it is still needed
HEADER_KEYWORDS = ['hugo_symbol','entrez_gene_id']
# gene symbols (first column of the CNA file) that are renamed to the mapped symbol while loading,
# so that records read under either name end up grouped under the same gene
GENE_MERGE_LIST = {'CDKN2Ap16INK4A': 'CDKN2A',
'CDKN2Ap14ARF': 'CDKN2A',
'MLL': 'KMT2A',
'MLL2': 'KMT2D',
'MLL3': 'KMT2C',
'MLL4': 'KMT2B',
'FAM123B': 'AMER1',
'MYCL1': 'MYCL'}

def get_file_header(filename):
    """ Returns the file header.

    The header is the first line that does not start with '#'. It is split on
    tabs and every column name is stripped of surrounding whitespace.

    Raises ValueError when the file contains no such line.
    """
    # 'U' (universal newlines) is a Python 2 flag; Python 3 text mode already
    # translates newlines and rejects the flag from 3.11 onwards
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(filename, mode) as data_file:
        # stop at the first non-comment line instead of reading the whole file
        for line in data_file:
            if not line.startswith('#'):
                return [column.strip() for column in line.split('\t')]
    raise ValueError('No header line found in file: ' + filename)

def _merge_cna_values(values):
    """ Collapses the values of one column into a single value; returns None when they conflict. """
    distinct = set(values)
    # placeholders only lose against another value: drop them one at a time,
    # and only while more than one candidate is left
    for placeholder in ('NA', '', '0'):
        if len(distinct) > 1:
            distinct.discard(placeholder)
    # when exactly -1.5 and -2 remain, -2 is the value that is kept
    if len(distinct) == 2 and '-1.5' in distinct and '-2' in distinct:
        distinct.discard('-1.5')
    if len(distinct) != 1:
        return None
    return str(distinct.pop())

def merge_duplicate_cna_records(data):
    """ Merges genes that have more than one CNA record into a single record.

    data maps a gene symbol to a list of records, each record being the list
    of per-column values for that gene. Records of a gene are merged column by
    column: 'NA', '' and '0' give way to any other value, and '-2' is kept
    over '-1.5'. If any column still holds conflicting values, or the records
    do not all have the same number of columns, the gene is reported on
    ERROR_FILE and its records are left unmerged.

    data is modified in place and returned.
    """
    for gene in data:
        records = data[gene]
        if len(records) < 2:
            continue
        merged_record = None
        # zip() would silently truncate ragged records to the shortest one and
        # produce a merged row with fewer columns than the header
        if len(set(len(record) for record in records)) == 1:
            merged_record = []
            for column_values in zip(*records):
                merged_value = _merge_cna_values(column_values)
                if merged_value is None:
                    merged_record = None
                    break
                merged_record.append(merged_value)
        if merged_record is None:
            print >> ERROR_FILE, "The copy number values for gene", gene, "cannot be merged"
        else:
            data[gene] = [merged_record]
    return data

def write_merged_cna_data(data,header,out_cna_filepath):
    """ Writes the CNA records to 'data_CNA_merged.txt' and 'data_CNA_unmerged.txt'.

    Genes that still have more than one record go to the unmerged file, all
    other genes go to the merged file. Each file starts with the tab-joined
    header and is only created when it has at least one record. The file names
    are appended directly to out_cna_filepath, so a directory must be passed
    with its trailing path separator.
    """
    unmerged_lines = []
    merged_lines = []

    for gene_symbol in data:
        records = data[gene_symbol]
        if len(records) > 1:
            for record in records:
                unmerged_lines.append(gene_symbol+'\t'+'\t'.join(record)+'\n')
        else:
            merged_lines.append(gene_symbol+'\t'+'\t'.join(records[0])+'\n')

    header_line = '\t'.join(header)+'\n'

    if unmerged_lines:
        unmerged_filename = out_cna_filepath+'data_CNA_unmerged.txt'
        # the with-block guarantees the data is flushed and the handle closed
        with open(unmerged_filename,'w') as unmerged_file:
            unmerged_file.write(header_line)
            unmerged_file.writelines(unmerged_lines)
        print >> OUTPUT_FILE, "The unmerged CNA records are written to :", unmerged_filename
    if merged_lines:
        merged_filename = out_cna_filepath+'data_CNA_merged.txt'
        with open(merged_filename,'w') as merged_file:
            merged_file.write(header_line)
            merged_file.writelines(merged_lines)
        print >> OUTPUT_FILE, "The merged CNA records are written to :", merged_filename

def main():
    """ Reads a CNA file, merges duplicate gene records and writes the merged / unmerged output files.

    Command line options:
      -i / --input-cnafile         CNA file to read
      -o / --output-cna-filepath   prefix the output file names are appended to
    Exits with a usage error when either option is missing.
    """
    # get command line arguments
    parser = optparse.OptionParser()
    parser.add_option('-i', '--input-cnafile', action = 'store', dest = 'cnafile')
    parser.add_option('-o', '--output-cna-filepath', action = 'store', dest = 'out_cna_filepath')

    (options, args) = parser.parse_args()
    cna_filename = options.cnafile
    out_cna_filepath = options.out_cna_filepath

    # report missing options as a usage error rather than a TypeError raised
    # later by open() or by the output path concatenation
    if not cna_filename or not out_cna_filepath:
        parser.error('both --input-cnafile and --output-cna-filepath are required')

    header = get_file_header(cna_filename)

    # load data from cna_filename; the first non-comment line is the header and is skipped
    with open(cna_filename, 'rU') as data_file:
        data_reader = [line for line in data_file if not line.startswith('#')][1:]

    COPY_NUMBER_DATA = {}
    for line in data_reader:
        # a blank line would otherwise be stored as a record for a gene named ''
        if not line.strip():
            continue
        line = line.strip('\n').split('\t')
        # group records of renamed genes under the symbol they are mapped to
        gene_symbol = GENE_MERGE_LIST.get(line[0], line[0])
        COPY_NUMBER_DATA.setdefault(gene_symbol, []).append(line[1:])

    data = merge_duplicate_cna_records(COPY_NUMBER_DATA)

    write_merged_cna_data(data,header,out_cna_filepath)

if __name__ == '__main__':
main()
84 changes: 84 additions & 0 deletions import-scripts/remove-duplicate-maf-variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import sys
import os
import optparse

# Script to remove duplicate maf records based on the 8 key columns.
# Calculates the VAF for each duplicate record and keeps the record with the highest VAF.
# Formula for VAF = t_alt_count / (t_ref_count + t_alt_count)

# streams used for error and status messages
ERROR_FILE = sys.stderr
OUTPUT_FILE = sys.stdout

# positions of KEY_COLUMNS within the maf header; filled in by main() when the header line is read
KEY_COLUMNS_INDEX = []
# columns whose combined values form the key of a record; records with the same key are treated as duplicates
KEY_COLUMNS = ['Entrez_Gene_Id','Chromosome','Start_Position','End_Position','Variant_Classification','Tumor_Seq_Allele2','Tumor_Sample_Barcode','HGVSp_Short']
# maps each key (tab-joined values of KEY_COLUMNS) to the list of raw maf lines that share it
MAF_DATA = {}

def _calculate_vaf(key, record, t_refc_index, t_altc_index):
    """ Returns the VAF of one maf line as a float, or None (after reporting it) when it cannot be calculated. """
    columns = record.rstrip('\n').split('\t')
    # read the raw counts defensively so a short line is reported instead of raising IndexError
    t_ref_count = columns[t_refc_index] if t_refc_index < len(columns) else ''
    t_alt_count = columns[t_altc_index] if t_altc_index < len(columns) else ''
    try:
        alt = int(t_alt_count)
        ref = int(t_ref_count)
        # float() forces true division; under Python 2 int / int truncates to 0
        return float(alt) / (alt + ref)
    except (ValueError, ZeroDivisionError):
        print >> ERROR_FILE, 'ERROR: VAF cannot be calculated for the variant : ' + key
        print >> ERROR_FILE, 'The t_ref_count is: '+ t_ref_count + ' and t_alt_count is: '+ t_alt_count
        return None

def remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index):
    """ Writes the maf back to maf_filename with exactly one record per key of MAF_DATA.

    For a key with several records the record with the highest VAF
    (t_alt_count / (t_ref_count + t_alt_count)) is kept; on a tie the earlier
    record wins. Records whose VAF cannot be calculated (non-numeric counts or
    a total count of zero) are reported on ERROR_FILE and only kept when no
    record of that key has a VAF, in which case the first record is written.

    maf_filename  -- file that is overwritten with the result
    comments      -- comment lines written first, as one string
    header        -- header line written after the comments
    t_refc_index  -- column position of t_ref_count
    t_altc_index  -- column position of t_alt_count
    """
    outfile = [comments, header]
    # NOTE: records are written in the iteration order of MAF_DATA
    for key in MAF_DATA:
        records = MAF_DATA[key]
        best_record = records[0]
        if len(records) > 1:
            best_vaf = None
            for record in records:
                vaf = _calculate_vaf(key, record, t_refc_index, t_altc_index)
                if vaf is not None and (best_vaf is None or vaf > best_vaf):
                    best_vaf = vaf
                    best_record = record
        # append once per key, after all duplicates have been compared
        outfile.append(best_record)

    with open(maf_filename, 'w') as datafile:
        datafile.writelines(outfile)
    print >> OUTPUT_FILE, 'MAF file with duplicate variants removed is written to: ' + maf_filename


def main():
    """ Loads the maf given by -i / --input-maf-file, groups its records by the key columns and rewrites it without duplicates.

    Lines starting with '#' are kept as comments, the line starting with
    'Hugo_Symbol' is the header, and every other line is a record. Exits with
    status 2 when the option is missing or when no header line precedes the
    records.
    """
    # get command line arguments
    parser = optparse.OptionParser()
    parser.add_option('-i', '--input-maf-file', action = 'store', dest = 'maf_file')

    (options, args) = parser.parse_args()
    maf_filename = options.maf_file
    # report a missing option as a usage error rather than a TypeError raised by open()
    if not maf_filename:
        parser.error('--input-maf-file is required')

    comments = ""
    header = ""
    # stay None until the header line has been read
    t_refc_index = None
    t_altc_index = None

    with open(maf_filename,'r') as maf_file:
        for line in maf_file:
            if line.startswith('#'):
                comments += line
            elif line.startswith('Hugo_Symbol'):
                header += line
                header_cols = line.rstrip('\n').split('\t')
                #get the positions of the 8 key maf columns
                for value in KEY_COLUMNS:
                    KEY_COLUMNS_INDEX.append(header_cols.index(value))
                t_refc_index = header_cols.index('t_ref_count')
                t_altc_index = header_cols.index('t_alt_count')
            else:
                # without the header the key column positions are unknown and
                # every record would be grouped under the same empty key
                if t_refc_index is None:
                    break
                data = line.rstrip('\n').split('\t')
                # trailing tabs are stripped so that the key text stays the same as before
                reference_key = '\t'.join([data[index] for index in KEY_COLUMNS_INDEX]).rstrip('\t')
                MAF_DATA.setdefault(reference_key, []).append(line)

    if t_refc_index is None:
        print >> ERROR_FILE, 'ERROR: no header line starting with Hugo_Symbol found before the records in: ' + maf_filename
        sys.exit(2)

    remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index)

if __name__ == '__main__':
main()
19 changes: 16 additions & 3 deletions import-scripts/subset-impact-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ if [ $STUDY_ID == "genie" ]; then
else
# starting in Nov 2018 releases, all vital status information will be removed from patient file and placed in a separate file:
# get the patients from the filtered set of patients from the generate-clinical-subset.py call and expand file with the vital status columns
cut -f1 $OUTPUT_DIRECTORY/data_clinical_supp_patient.txt > $OUTPUT_DIRECTORY/vital_status.txt
$PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/expand-clinical-data.py --study-id="genie" --clinical-file="$OUTPUT_DIRECTORY/vital_status.txt" --clinical-supp-file="$INPUT_DIRECTORY/ddp/ddp_vital_status.txt" --fields="YEAR_CONTACT,YEAR_DEATH,INT_CONTACT,INT_DOD,DEAD" --identifier-column-name="PATIENT_ID"
$PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/expand-clinical-data.py --study-id="genie" --clinical-file="$OUTPUT_DIRECTORY/data_clinical_supp_patient.txt" --clinical-supp-file="$INPUT_DIRECTORY/ddp/ddp_vital_status.txt" --fields="YEAR_CONTACT,YEAR_DEATH,INT_CONTACT,INT_DOD,DEAD" --identifier-column-name="PATIENT_ID"
if [ $? -gt 0 ] ; then
echo "Failed to expand $OUTPUT_DIRECTORY/vital_status.txt with YEAR_CONTACT, YEAR_DEATH, INT_CONTACT, INT_DOD, DEAD from $INPUT_DIRECTORY/ddp/ddp_vital_status.txt. Exiting..."
exit 2
Expand All @@ -106,7 +105,7 @@ if [ $STUDY_ID == "genie" ]; then
$PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/add-age-at-seq-report.py --clinical-file="$OUTPUT_DIRECTORY/data_clinical_supp_sample.txt" --seq-date-file="$INPUT_DIRECTORY/cvr/seq_date.txt" --age-file="$INPUT_DIRECTORY/ddp/ddp_age.txt" --convert-to-days="true"
if [ $? -gt 0 ] ; then
echo "Failed to add AGE_AT_SEQ_REPORT to $OUTPUT_DIRECTORY/data_clinical_supp_sample.txt using $INPUT_DIRECTORY/cvr/seq_date.txt. Exiting..."
exit 2l
exit 2
fi

# rename GENE_PANEL to SEQ_ASSAY_ID in data_clinical_supp_sample.txt
Expand All @@ -123,6 +122,20 @@ if [ $STUDY_ID == "genie" ]; then
# remove germline mutations from maf
grep -v 'GERMLINE' $OUTPUT_DIRECTORY/data_mutations_extended.txt > $OUTPUT_DIRECTORY/data_mutations_extended.txt.tmp
mv $OUTPUT_DIRECTORY/data_mutations_extended.txt.tmp $OUTPUT_DIRECTORY/data_mutations_extended.txt
fi

# remove duplicate variants from maf
$PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/remove-duplicate-maf-variants.py --input-maf-file="$OUTPUT_DIRECTORY/data_mutations_extended.txt"
if [ $? -gt 0 ] ; then
    # name the step and file that actually failed (the message used to be a copy of the CNA one)
    echo "Failed to remove duplicate variants from $OUTPUT_DIRECTORY/data_mutations_extended.txt. Exiting..."
    exit 2
fi

# merge CNA records for certain gene duplicates
# merge-cna-records.py writes data_CNA_merged.txt and data_CNA_unmerged.txt; it appends those
# file names directly to --output-cna-filepath, hence the trailing slash on the directory
$PYTHON_BINARY $PORTAL_SCRIPTS_DIRECTORY/merge-cna-records.py --input-cnafile="$OUTPUT_DIRECTORY/data_CNA.txt" --output-cna-filepath="$OUTPUT_DIRECTORY/"
if [ $? -gt 0 ] ; then
echo "Failed to merge duplicate CNA records in $OUTPUT_DIRECTORY/data_CNA.txt. Exiting.."
exit 2
fi
fi
else
Expand Down