sep-scripts/tcga_maf_reader.py at master · carlcward/sep-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re, sys
from collections import defaultdict, OrderedDict
'''
Reads in mutation files from TCGA data and checks to see if any of the seps has a mutation in item
Required addition of additional output from sep_processor - where in the transcript each sep start (base offset)
so the coordinates of the mutation and sep could be lined up.
'''
TISSUE_SOURCE_FILE_PATH = '/Users/carlcward/Desktop/Carl/MAFs/tissueSourceSite.txt'
def RepresentsInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

def create_tissue_source_dict(tss_file_name):
	tss_file = open(tss_file_name,'rU')
	out_dict = {}
	for line in tss_file:
		line = line.split("\t")
		code = line[0].strip()
		name = line[2].strip()
		short = line[3].strip()
		out_dict[code] = (short,name)
	tss_file.close()
	return out_dict


def read_maf(file_name, code_dict):
	coordinate_map = defaultdict(list)
	maf = open(file_name, 'rU')
	for line in maf:
		line = line.split("\t")

		if len(line) > 5 and RepresentsInt(line[5]):
			# chromosome, mutation start coord, mutation stop coord

			chrom = line[4]
			coord_start = int(line[5])
			coord_end = int(line[6])
			coord_key = "%s" % (chrom)
			code = line[16].split("-")[1]
			code_string = code_dict[code][1]
			coord_string = "chr%s:%i-%i" % (chrom, coord_start, coord_end)
			coordinate_map[coord_key].append((coord_start,coord_end, [coord_string] +[line[0]] +[code_string] + line[7:16]))
	maf.close()
	return coordinate_map

# get the index of sep transcipts coords in the sep_processor results file (dynamic)
def read_header(sep_file):
	header = sep_file.readline().split("\t")
	index = 0
	for i in range(len(header)):

		if  'Sep Start In Transcript' in header[i].strip() or 'Sep Transcript Coords' in header[i].strip():
			return i


def read_sep_file(file_name):
	coordinate_map = OrderedDict()
	sep = open(file_name, 'rU')
	sep_coords_i = read_header(sep)
	for line in sep:
		line = line.split("\t")
		peptide_data = line[:9]
		peptide = line[1]
		peptide_key = tuple(peptide_data[8])
		if peptide_key not in coordinate_map:
			coordinate_map[peptide_key] = [peptide_data]
			tcoords = line[sep_coords_i].split(";")
			tcoords = filter(lambda c: c.strip() != '', tcoords)
			for coord_string in tcoords:
				# transcript coords from the Sep Start Column in out from sep_processor.py
				regex = re.match("chr([0-9XY]{1,2}):([0-9]*)\-([0-9]*)",coord_string)
				if regex != None:


					regex = regex.groups()
					coord_key = regex[0]
					# sep coord start = transcript coord + sep offest
					coord_start = int(regex[1]) #+ sep_start_offset
					coord_end = int(regex[2]) #coord_start + sep_length_bases
					# the exact same data already exist in the dictions, do not add it
					# other wise add in the form (start coord, end coord, peptide data)

					coordinate_map[peptide_key].append((coord_key,coord_start,coord_end))
	sep.close()
	return coordinate_map

if __name__ == '__main__':
	# collect the matches... where a known mutation lies within a TRANSCRIPT for one of the SEPs
	# (not necessarily within the sep itsself). We need a way to start the exact location info for the SEPs
	hits = OrderedDict()
	code_dict = create_tissue_source_dict(TISSUE_SOURCE_FILE_PATH)
	sep_coord = read_sep_file(sys.argv[1])
	maf_coord = read_maf(sys.argv[2], code_dict)
	out_file = open(sys.argv[3], 'w')
	# print the lengths of each dict (all values) (fancy flatten syntax)
	print (len([item for sub in sep_coord.values() for item in sub]),len([item for sub in maf_coord.values() for item in sub]))
	# for unique sep with seps
	for sep,data in sep_coord.items():
		sep_data = data[0]
		loc = 1
		# for each sep on a transcript

		for chrom,sep_start,sep_stop in data[1:]:
			#print sep_stop,sep_start, sep

			# for each mutation on the same chromosome
			for mut_start,mut_stop,mut_data in maf_coord[chrom]:
				# see if the mutation matches with the sep but dont include the same mutation for the same set of sep data
				# tuples are hashable to create a dict with them of sep data for duplicate exclusion
				if mut_start > sep_start and mut_start < sep_stop:
					#print loc, mut_start-sep_start
					mut_id = "%s,%s" % (loc + mut_start-sep_start, mut_stop-mut_start)

					if mut_id not in [mut[0] for mut in hits.get(tuple(sep_data),[])]:
						hits.setdefault(tuple(sep_data),[]).append([mut_id]+mut_data)
			loc += sep_stop - sep_start
	# collected results
	print len(hits)
	for sep,v in hits.items():
		outfile.write('\n')
		for mut in v:
			outfile.write("%s\n%s\n" % ('\t'.join(('',)+sep),'\t'.join(mut)))
	outfile.close()