-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgtf_refflat.py
More file actions
executable file
·119 lines (90 loc) · 4.23 KB
/
gtf_refflat.py
File metadata and controls
executable file
·119 lines (90 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
"""
Convert GTF to UCSC's "refFlat" format, as required by CollectRnaSeqMetrics.
Description from UCSC schema documentation:
**Gene Predictions and RefSeq Genes**
The following definition is used for gene prediction tables. In alternative-splicing situations, each transcript has a row in this table.
A version of genePred that associates the gene name with the gene prediction information. In alternative splicing situations each transcript has a row in this table.
**table refFlat**
::
'A gene prediction with additional geneName field.'
string geneName; "Name of gene as it appears in Genome Browser."
string name; "Name of gene"
string chrom; "Chromosome name"
char[1] strand; "+ or - for strand"
uint txStart; "Transcription start position"
uint txEnd; "Transcription end position"
uint cdsStart; "Coding region start"
uint cdsEnd; "Coding region end"
uint exonCount; "Number of exons"
uint[exonCount] exonStarts; "Exon start positions"
uint[exonCount] exonEnds; "Exon end positions"
notes:
- for transcripts without cds regions, refFlat outputs 'txEnd' for both 'cdsStart' and 'cdsEnd'
- exon position lists have a trailing ',' for some reason
- exons need to be sorted by start position (this only cost me 2 days of my life)
"""
__author__ = 'Dmitri Bichko, Rob Moccia'
__version__ = '0.1'
import gtf
import argparse
import sys
from itertools import chain
def configure_args(parser):
parser.add_argument('--input', required=True, help='GTF file')
parser.add_argument('--output', required=True, help='output file')
parser.add_argument('--version', action='version',
version='%(prog)s {version}'.format(version=__version__))
def process(args):
num_records = 0
num_written = 0
num_gene = 0
num_tran = 0
num_exon = 0
num_cds = 0
tran = None
# ensembl gtf files are sorted by gene_id and transcript_id
with open(args.output, 'w') as writer:
with open(args.input, 'r') as reader:
for rec in chain(gtf.open(reader, 'ensembl'), [None]):
# chained end marker to flush the last record
if not rec or (tran and tran.id != rec.transcript_id):
if not tran.cds_start:
tran.cds_start, tran.cds_stop = tran.stop, tran.stop
else:
num_cds += 1
tran.exons.sort(key=lambda e: e.start)
flat = [tran.gene_id, tran.transcript_id, tran.chr, tran.strand, tran.start, tran.stop, tran.cds_start, tran.cds_stop]
flat += [len(tran.exons)]
flat += [','.join(str(e.start) for e in tran.exons) + ',']
flat += [','.join(str(e.stop) for e in tran.exons) + ',']
writer.write('\t'.join(str(f) for f in flat) + '\n')
num_written += 1
if not rec:
break
# ucsc is "half-open" zero index
rec.start = rec.start - 1
num_records += 1
if rec.feature == 'gene':
num_gene += 1
tran = None
elif rec.feature == 'transcript':
num_tran += 1
tran = rec
tran.exons = []
tran.cds_start = None
tran.cds_stop = None
elif rec.feature == 'CDS':
if not tran.cds_start or rec.start < tran.cds_start:
tran.cds_start = rec.start
if not tran.cds_stop or rec.stop > tran.cds_stop:
tran.cds_stop = rec.stop
elif rec.feature == 'exon':
num_exon += 1
tran.exons.append(rec)
return dict(records=num_records, written=num_written, genes=num_gene, transcripts=num_tran, exons=num_exon, cds=num_cds)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
configure_args(parser)
res = process(parser.parse_args())
print(res, file=sys.stderr)