-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmaizegdb_gff3_formatter.py
More file actions
174 lines (144 loc) · 5.71 KB
/
maizegdb_gff3_formatter.py
File metadata and controls
174 lines (144 loc) · 5.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/python3
import os
import sys
Usage = """
formats the GFF3 to final version - similar to NAM gff3 file
including the canonical transcript assignment tag
The file is acceptable for MaizeGDB submission
Usage:
maizegdb_gff3_formatter.py <renamed_agat_formatted.gff3> <canonical_transcript_ids.txt>
Input:
renamed_agat_formatted.gff3 : The gff3 file sanitized using the agat_sp_manage_IDs.pl script
typically, you should first obtain gene id prefix from maizeGDB and run this as follows
agat_sp_manage_IDs.pl --gff input.gff3 --prefix Ab00001aa --tair --output prefinal.gff3
canonical_transcript_ids.txt : list of transcript ids that are considered as primary transcript. You can run the TRaCE program to determine
the canonical transcript and create a list of mRNA ids (one per line). The number should be equal to the gene count in GFF3
eg command:
maizegdb_gff3_formatter.py renamed_agat_formatted.gff3 canonical_transcript_ids.txt > maizeGDB_specifications.gff3
Arun Seetharam
arnstrm@iastate.edu
09/15/2022
"""
if len(sys.argv) < 2:
print(Usage)
exit()
else:
cmdargs = str(sys.argv)
file1 = sys.argv[1]
file2 = sys.argv[2]
def remap_gene(oldid, oldparent):
spPrefix = oldid[0:9]
geneNum = oldid[9:]
newid = spPrefix + geneNum.zfill(6)
newparent = spPrefix + geneNum.zfill(6)
return(newid, newparent)
def remap_mrna(oldid, oldparent):
spPrefix = oldid[0:9]
tid = oldid.split(".")[1]
geneNum = oldid.split(".")[0][9:]
newid = spPrefix + geneNum.zfill(6) + "_T" + tid.zfill(3)
newparent = spPrefix + geneNum.zfill(6)
return(newid, newparent)
def remap_exon(oldid, oldparent):
spPrefix = oldid[0:9]
exonNum = oldid.split("-")[1].strip("exon")
tid = oldid.replace("-", ".").split(".")[1]
geneNum = oldid.split(".")[0][9:]
newid = spPrefix + geneNum.zfill(6) + \
"_T" + tid.zfill(3) + ".exon." + exonNum
newparent = spPrefix + geneNum.zfill(6) + "_T" + tid.zfill(3)
return(newid, newparent)
def remap_cds(oldid, oldparent):
spPrefix = oldid[0:9]
cdsNum = oldid.split("-")[1].strip("cds")
tid = oldid.replace("-", ".").split(".")[1]
geneNum = oldid.split(".")[0][9:]
newid = spPrefix + geneNum.zfill(6) + \
"_P" + tid.zfill(3)
newparent = spPrefix + geneNum.zfill(6) + "_T" + tid.zfill(3)
return(newid, newparent)
def remap_5utr(oldid, oldparent):
spPrefix = oldid[0:9]
utr = oldid.split("-")[1].strip("five_prime_utr")
tid = oldid.replace("-", ".").split(".")[1]
geneNum = oldid.split(".")[0][9:]
newid = spPrefix + geneNum.zfill(6) + \
"_T" + tid.zfill(3) + ".five_prime_utr." + utr
newparent = spPrefix + geneNum.zfill(6) + "_T" + tid.zfill(3)
return(newid, newparent)
def remap_3utr(oldid, oldparent):
spPrefix = oldid[0:9]
utr = oldid.split("-")[1].strip("three_prime_utr")
tid = oldid.replace("-", ".").split(".")[1]
geneNum = oldid.split(".")[0][9:]
newid = spPrefix + geneNum.zfill(6) + \
"_T" + tid.zfill(3) + ".three_prime_utr." + utr
newparent = spPrefix + geneNum.zfill(6) + "_T" + tid.zfill(3)
return(newid, newparent)
# read GFF file
my_gff = []
my_comments = []
with open(file1, 'r') as gff_file:
for line in gff_file:
if not line.startswith("#"):
features = line.rstrip("\n").split("\t")
my_gff.append(features)
with open(file1, 'r') as gff_file:
for line in gff_file:
if line.startswith("##"):
comments = line.rstrip("\n")
my_comments.append(comments)
# read primary ids
txt_file = open(file2, "r")
file_content = txt_file.read()
my_primary_ids = file_content.split("\n")
# process gff3 lines
for comments in my_comments:
print(comments)
for features in my_gff:
seqid, source, type, start, end, score, strand, phase, attributes = features
d = dict(x.split("=") for x in attributes.split(";"))
if type == "gene":
d["ID"], d["Name"] = remap_gene(d["ID"], d["Name"])
ID = 'ID='+d["ID"]
req_attr = [ID, 'biotype=protein_coding', 'logic_name=BIND_gene']
if type == "mRNA":
d["ID"], d["Parent"] = remap_mrna(d["ID"], d["Parent"])
ID = 'ID='+d["ID"]
Parent = 'Parent='+d["Parent"]
biotype = 'biotype=protein_coding'
transcript_id = 'transcript_id='+d["ID"]
if d["ID"] in my_primary_ids:
primary = 'canonical_transcript=1'
req_attr = [ID, Parent, biotype, transcript_id, primary]
else:
req_attr = [ID, Parent, biotype, transcript_id]
if type == "exon":
d["ID"], d["Parent"] = remap_exon(d["ID"], d["Parent"])
ID = 'ID='+d["ID"]
Parent = 'Parent='+d["Parent"]
Name = 'ID='+d["ID"]
exonID = 'ID='+d["ID"]
strsplit = exonID.split('.')
rank = 'rank='+strsplit[-1]
req_attr = [Parent, Name, exonID, rank]
if type == "CDS":
d["ID"], d["Parent"] = remap_cds(d["ID"], d["Parent"])
ID = 'ID='+d["ID"]
Parent = 'Parent='+d["Parent"]
protein_id = 'protein_id='+d["ID"]
req_attr = [ID, Parent, protein_id]
if type == "five_prime_UTR":
d["ID"], d["Parent"] = remap_5utr(d["ID"], d["Parent"])
ID = 'ID='+d["ID"]
Parent = 'Parent='+d["Parent"]
req_attr = [ID, Parent]
if type == "three_prime_UTR":
d["ID"], d["Parent"] = remap_3utr(d["ID"], d["Parent"])
ID = 'ID='+d["ID"]
Parent = 'Parent='+d["Parent"]
req_attr = [ID, Parent]
attributes = ";".join(req_attr)
source = "PanAnd"
print(seqid, source, type, start, end, score,
strand, phase, attributes, sep='\t')