-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgtf.py
More file actions
executable file
·117 lines (88 loc) · 4.53 KB
/
Copy pathgtf.py
File metadata and controls
executable file
·117 lines (88 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
Tools for working with GTF files.
GTF Format:
Fields must be tab-separated. Also, all but the final field in each feature line must contain a value; "empty" columns should be denoted with a '.'
seqname - name of the chromosome or scaffold; chromosome names can be given with or without the 'chr' prefix. Important note: the seqname must be one used within Ensembl, i.e. a standard chromosome name or an Ensembl identifier such as a scaffold ID, without any additional content such as species or assembly. See the example GFF output below.
source - name of the program that generated this feature, or the data source (database or project name)
feature - feature type name, e.g. Gene, Variation, Similarity
start - Start position of the feature, with sequence numbering starting at 1.
end - End position of the feature, with sequence numbering starting at 1.
score - A floating point value.
strand - defined as + (forward) or - (reverse).
frame - One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, '1' that the second base is the first base of a codon, and so on..
attribute - A semicolon-separated list of tag-value pairs, providing additional information about each feature.
Some fields are renamed to match our common convention.
1 havana gene 11869 14409 . + . gene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene";
attr_format: setting the attribute format to 'ensembl' will use a simplified, and somewhat faster, parser for the attribute list, which is then expected to be well-formed.
otherwise, the permissive regex described below is used.
"""
import re
re_attrs = re.compile(r'(\w+)(?:\s*=\s*|\s+)(?:"(.*?)"|(.*?));\s*')
id_keys = [
('gene_id', 'gene_version'),
('transcript_id', 'transcript_version'),
('protein_id', 'protein_version'),
]
# attribute tags are optionally quoted, and separated by wither whitespace or '='
#
# the match pattern is:
# - word
# - '=' OR whitespace
# - '*' any '*' OR any (non-greedy match)
# - ';'
# - optional whitespace
def open(reader, attr_format=None, keep_line=False, append_versions=False):
# strip only line terminators; by default, rstrip() will remove trailing tabs
lines = (line.rstrip('\r\n') for line in reader if not line[0] == '#')
for line in lines:
rec = parse_line(line, attr_format, append_versions)
yield (rec, line) if keep_line else rec
def parse_line(line, attr_format=None, append_versions=False):
# 8 splits, 9 fields; attribute field may include tabs
cols = [f if f != '.' else None for f in line.split('\t', 8)]
rec = GtfRecord()
rec.chr = cols[0]
rec.source = cols[1]
rec.feature = cols[2]
rec.start = int(cols[3]) if cols[3] is not None else None
rec.stop = int(cols[4]) if cols[4] is not None else None
rec.score = float(cols[5]) if cols[5] is not None else None
rec.strand = cols[6]
rec.frame = int(cols[7]) if cols[7] is not None else None
if attr_format == 'ensembl':
rec.meta = dict((t, v[1:-1]) for t, v in (tag.split(' ', 1) for tag in cols[8].rstrip(';').split('; ')))
if append_versions:
keys = [k for k in rec.meta.keys() if k.endswith('_id')]
for key, ver in id_keys:
if key in rec.meta and ver in rec.meta:
rec.meta[key] = rec.meta[key] + '.' + rec.meta.pop(ver)
else:
rec.meta = dict((m[0], m[1] or m[2]) for m in re_attrs.findall(cols[8]))
if 'gene_id' in rec.meta:
rec.gene_id = rec.meta['gene_id']
if rec.feature == 'gene':
rec.id = rec.gene_id
if 'transcript_id' in rec.meta:
rec.transcript_id = rec.meta['transcript_id']
if rec.feature == 'transcript':
rec.id = rec.transcript_id
return rec
class GtfRecord:
def __init__(self):
self.chr = None
self.source = None
self.feature = None
self.start = None
self.stop = None
self.score = None
self.strand = None
self.frame = None
self.meta = None
self.id = None
self.gene_id = None
self.transcript_id = None
def fields(self):
fmeta = ' '.join(f'{k} "{v}";' for k, v in self.meta.items())
return [self.chr, self.source, self.feature, self.start, self.stop, self.score, self.strand, self.frame, fmeta]
def __str__(self):
return '\t'.join(str(f if f is not None else '.') for f in self.fields())