-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathBuild_Trinotate_Boilerplate_SQLite_db_MODIFIED_INVERTEBRATES.pl
More file actions
154 lines (93 loc) · 5.83 KB
/
Build_Trinotate_Boilerplate_SQLite_db_MODIFIED_INVERTEBRATES.pl
File metadata and controls
154 lines (93 loc) · 5.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env perl
# NOTICE: This is a modified version of the Build_Trinotate_Boilerplate_SQLite_db.pl script included in the Trinotate suite (https://trinotate.github.io/)
# It loads a manually downloaded invertebrates UniProt library (uniprot_sprot_invertebrates.dat, read by this script as the gzip-compressed file uniprot_sprot_invertebrates.dat.gz in the current directory) instead of the standard UniProt library (uniprot_sprot.dat).
# All other libraries are downloaded and loaded as in the original script.
# Some paths have been modified.
use strict;
use warnings;
use FindBin;
use lib ("$FindBin::RealBin/../PerlLib");
use Pipeliner;
# Usage text; emitted through die() when no database prefix is given.
my $usage = "\n\n\tusage: $0 Database_prefix\n\n\n";

# The first command-line argument names the database: the SQLite file and the
# UniProt/taxonomy index files used later are all derived from it.
my $prefix = $ARGV[0];
die $usage unless $prefix;

# Directory of helper utilities, resolved relative to this script's real location.
my $UTILDIR = $FindBin::RealBin . "/util";

## Resources:
# Download locations of the reference data sets. The Swiss-Prot URL is kept
# for reference; in this modified script the corresponding wget step is
# commented out.
my $SPROT_DAT_URL         = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz';
my $EGGNOG_DAT_URL        = 'http://eggnogdb.embl.de/download/latest/data/NOG/NOG.annotations.tsv.gz';
my $GENE_ONTOLOGY_DAT_URL = 'http://purl.obolibrary.org/obo/go/go-basic.obo';
my $PFAM_DAT_URL          = 'ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz';
my $PFAM2GO_DAT_URL       = 'http://www.geneontology.org/external2go/pfam2go';
## Build the Trinotate boilerplate SQLite database "$prefix.sqlite".
##
## Every step is registered with a Pipeliner as a shell command paired with a
## checkpoint file under $checkpoint_dir, then the whole pipeline is run.
## After a successful run the intermediate files and the checkpoint directory
## are removed.
##
## The helper scripts are invoked as ./<script>, so this must be run from the
## directory that contains them.
## NOTE(review): $prefix is interpolated into shell commands unquoted; a prefix
## containing spaces or shell metacharacters will not work as intended.
main: {

    my $checkpoint_dir = "__trino_chkpts";
    my $sqlite_db      = "$prefix.sqlite";

    # Manually downloaded Swiss-Prot invertebrates flat file (gzip-compressed).
    my $sprot_dat = "uniprot_sprot_invertebrates.dat.gz";

    # Common prefix of every database create/load command.
    my $loader = "./EMBL_dat_to_Trinotate_sqlite_resourceDB.pl --sqlite $sqlite_db";

    # An existing database without a checkpoint directory is not a resumable
    # run of this script, so refuse to touch it.
    if (-e $sqlite_db && ! -d $checkpoint_dir) {
        print STDERR "\n\n\tERROR, database: $sqlite_db already exists. Please remove or rename it before continuing.\n\n";
        exit(1);
    }

    # Nothing below downloads the invertebrates file, so fail early with a
    # clear message instead of letting the first pipeline command fail.
    # The check is skipped once the parsing checkpoint exists
    # (presumably Pipeliner does not re-run checkpointed steps -- TODO confirm).
    if (! -e "$checkpoint_dir/parse_sprot_dat.ok" && ! -e $sprot_dat) {
        print STDERR "\n\n\tERROR, cannot find $sprot_dat in the current directory. Please download it before continuing.\n\n";
        exit(1);
    }

    my $pipeliner = Pipeliner->new(-verbose => 2);

    unless (-d $checkpoint_dir) {
        mkdir $checkpoint_dir or die "Error, cannot mkdir $checkpoint_dir: $!";
    }

    # Ordered list of [ shell command, checkpoint file name ] pairs.
    #
    # The stock script started with these steps, replaced here by the manually
    # downloaded invertebrates file:
    #   wget "$SPROT_DAT_URL"                                   (wget_sprot_dat.ok)
    #   ./EMBL_swissprot_parser.pl uniprot_sprot.dat.gz $prefix (parse_sprot_dat.ok)
    #   mv uniprot_sprot.dat.gz.pep uniprot_sprot.pep           (rename_sprot_pep_file.ok)
    my @steps = (

        ## process Sprot dat file
        [ "./EMBL_swissprot_parser.pl $sprot_dat $prefix", "parse_sprot_dat.ok" ],
        [ "mv $sprot_dat.pep uniprot_sprot.pep",           "rename_sprot_pep_file.ok" ],

        # create sqlite database and load in the swissprot data:
        [ "$loader --create",                                 "init_sqlite_db.ok" ],
        [ "$loader --uniprot_index $prefix.UniprotIndex",     "uniprot_index_loading.ok" ],
        [ "$loader --taxonomy_index $prefix.TaxonomyIndex",   "taxonomy_index_loading.ok" ],

        ##########
        ## EGGNOG
        [ "wget \"$EGGNOG_DAT_URL\"", "eggnog_download.ok" ],

        # extract fields
        # note, had set -eou pipefail, but this generated errors on certain
        # flavors and/or versions of linux
        [ "gunzip -c NOG.annotations.tsv.gz | ./print.pl 1 5 > NOG.annotations.tsv.gz.bulk_load",
          "eggnog_field_extraction.ok" ],

        # load
        [ "$loader --eggnog NOG.annotations.tsv.gz.bulk_load", "eggnog.load.ok" ],

        ################
        ## Gene ontology
        [ "wget \"$GENE_ONTOLOGY_DAT_URL\"",                  "go_download.ok" ],
        [ "./obo_to_tab.pl go-basic.obo > go-basic.obo.tab",  "go_obo_to_tab.ok" ],
        [ "$loader --go_obo_tab go-basic.obo.tab",            "go_obo_load.ok" ],

        ##############
        ## Pfam
        [ "wget \"$PFAM_DAT_URL\"",                               "download_pfam.ok" ],
        [ "./PFAM_dat_parser.pl Pfam-A.hmm.gz",                   "pfam_parsing.ok" ],
        [ "$loader --pfam Pfam-A.hmm.gz.pfam_sqlite_bulk_load",   "pfam_loading.ok" ],

        #############
        ## Pfam2Go
        [ "wget \"$PFAM2GO_DAT_URL\" ",                   "pfam2go_download.ok" ],
        [ "./PFAMtoGoParser.pl pfam2go > pfam2go.tab",    "pfam2go_tab.ok" ],
        [ "$loader --pfam2go pfam2go.tab",                "pfam2go_tab_loading.ok" ],
    );

    # Register the steps one at a time, in order.
    for my $step (@steps) {
        my ($cmd, $checkpoint) = @$step;
        $pipeliner->add_commands(Command->new($cmd, "$checkpoint_dir/$checkpoint"));
    }

    $pipeliner->run();

    ## cleaning up:
    # Best effort: any listed file that does not exist is silently skipped.
    my @tmpfiles = qw(
        go-basic.obo
        pfam2go
        pfam2go.1
        NOG.annotations.tsv.gz
        NOG.annotations.tsv.gz.bulk_load
        go-basic.obo.tab
        Pfam-A.hmm.gz.pfam_sqlite_bulk_load
        pfam2go.tab.tab
        pfam2go.tab
    );
    push @tmpfiles, "$prefix.UniprotIndex", "$prefix.TaxonomyIndex";

    for my $file (@tmpfiles) {
        unlink($file);
    }

    # List-form system() avoids the shell and the pointless output capture of
    # backticks; a failed removal is reported but does not fail the build.
    system('rm', '-rf', $checkpoint_dir) == 0
        or warn "Warning, could not remove $checkpoint_dir\n";

    exit(0);
}