gtf-scripts/gtf_refflat.py at main · pfizer-opensource/gtf-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3

"""
Convert GTF to UCSC's "refFlat" format, as required by CollectRnaSeqMetrics.

Description from UCSC schema documentation:

  **Gene Predictions and RefSeq Genes**

  The following definition is used for gene prediction tables. In alternative-splicing situations, each transcript has a row in this table.

  A version of genePred that associates the gene name with the gene prediction information. In alternative splicing situations each transcript has a row in this table.

  **table refFlat**

  ::

    'A gene prediction with additional geneName field.'

      string  geneName;           "Name of gene as it appears in Genome Browser."
      string  name;               "Name of gene"
      string  chrom;              "Chromosome name"
      char[1] strand;             "+ or - for strand"
      uint    txStart;            "Transcription start position"
      uint    txEnd;              "Transcription end position"
      uint    cdsStart;           "Coding region start"
      uint    cdsEnd;             "Coding region end"
      uint    exonCount;          "Number of exons"
      uint[exonCount] exonStarts; "Exon start positions"
      uint[exonCount] exonEnds;   "Exon end positions"

  notes:
    - for transcripts without cds regions, refFlat outputs 'txEnd' for both 'cdsStart' and 'cdsEnd'
    - exon position lists have a trailing ',' for some reason
    - exons need to be sorted by start position (this only cost me 2 days of my life)

"""

__author__ = 'Dmitri Bichko, Rob Moccia'
__version__ = '0.1'

import gtf
import argparse
import sys
from itertools import chain


def configure_args(parser):
    parser.add_argument('--input', required=True, help='GTF file')
    parser.add_argument('--output', required=True, help='output file')
    parser.add_argument('--version', action='version',
        version='%(prog)s {version}'.format(version=__version__))

def process(args):

    num_records = 0
    num_written = 0
    num_gene = 0
    num_tran = 0
    num_exon = 0
    num_cds = 0

    tran = None

    # ensembl gtf files are sorted by gene_id and transcript_id
    with open(args.output, 'w') as writer:
        with open(args.input, 'r') as reader:
            for rec in chain(gtf.open(reader, 'ensembl'), [None]):

                # chained end marker to flush the last record
                if not rec or (tran and tran.id != rec.transcript_id):

                    if not tran.cds_start:
                        tran.cds_start, tran.cds_stop = tran.stop, tran.stop
                    else:
                        num_cds += 1

                    tran.exons.sort(key=lambda e: e.start)
                    flat = [tran.gene_id, tran.transcript_id, tran.chr, tran.strand, tran.start, tran.stop, tran.cds_start, tran.cds_stop]
                    flat += [len(tran.exons)]
                    flat += [','.join(str(e.start) for e in tran.exons) + ',']
                    flat += [','.join(str(e.stop) for e in tran.exons) + ',']
                    writer.write('\t'.join(str(f) for f in flat) + '\n')

                    num_written += 1
                    if not rec:
                        break

                # ucsc is "half-open" zero index
                rec.start = rec.start - 1
                num_records += 1

                if rec.feature == 'gene':
                    num_gene += 1
                    tran = None
                elif rec.feature == 'transcript':
                    num_tran += 1
                    tran = rec
                    tran.exons = []
                    tran.cds_start = None
                    tran.cds_stop = None
                elif rec.feature == 'CDS':
                    if not tran.cds_start or rec.start < tran.cds_start:
                        tran.cds_start = rec.start
                    if not tran.cds_stop or rec.stop > tran.cds_stop:
                        tran.cds_stop = rec.stop

                elif rec.feature == 'exon':
                    num_exon += 1
                    tran.exons.append(rec)

    return dict(records=num_records, written=num_written, genes=num_gene, transcripts=num_tran, exons=num_exon, cds=num_cds)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    configure_args(parser)
    res = process(parser.parse_args())
    print(res, file=sys.stderr)