forked from BioFSharp/BioFSharp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path03_03_gff3_parsing.fsx
More file actions
241 lines (195 loc) · 8.52 KB
/
03_03_gff3_parsing.fsx
File metadata and controls
241 lines (195 loc) · 8.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
(**
---
title: GFF3
category: BioParsers
categoryindex: 3
index: 2
---
*)
(*** hide ***)
(*** condition: prepare ***)
#r "nuget: FSharpAux, 1.1.0"
#r "nuget: FSharpAux.IO, 1.1.0"
#r "nuget: FSharp.Stats, 0.4.3"
#r "nuget: Plotly.NET, 2.0.0-preview.18"
#r "../src/BioFSharp/bin/Release/netstandard2.0/BioFSharp.dll"
#r "../src/BioFSharp.IO/bin/Release/netstandard2.0/BioFSharp.IO.dll"
#r "../src/BioFSharp.BioContainers/bin/Release/netstandard2.0/BioFSharp.BioContainers.dll"
#r "../src/BioFSharp.ML/bin/Release/netstandard2.0/BioFSharp.ML.dll"
#r "../src/BioFSharp.Stats/bin/Release/netstandard2.0/BioFSharp.Stats.dll"
(*** condition: ipynb ***)
#if IPYNB
#r "nuget: FSharpAux, 1.1.0"
#r "nuget: FSharpAux.IO, 1.1.0"
#r "nuget: FSharp.Stats, 0.4.3"
#r "nuget: Plotly.NET, 2.0.0-preview.18"
#r "nuget: Plotly.NET.Interactive, 2.0.0-preview.18"
#r "nuget: BioFSharp, {{fsdocs-package-version}}"
#r "nuget: BioFSharp.IO, {{fsdocs-package-version}}"
#r "nuget: BioFSharp.BioContainers, {{fsdocs-package-version}}"
#r "nuget: BioFSharp.ML, {{fsdocs-package-version}}"
#r "nuget: BioFSharp.Stats, {{fsdocs-package-version}}"
#endif // IPYNB
(**
# GFF3 parsing
[](https://mybinder.org/v2/gh/CSBiology/BioFSharp/gh-pages?filepath={{fsdocs-source-basename}}.ipynb) 
[]({{root}}{{fsdocs-source-basename}}.fsx) 
[]({{root}}{{fsdocs-source-basename}}.ipynb)
*Summary:* This example shows how to parse and write gff3 formatted files with BioFSharp
## Generic Feature Format Version 3
In GFF3 files every line represents one genomic feature with nine tab-delimited fields, whereas unlimited key-value pairs can be stored in field 9.
It is possible to link multiple features to genomic units using the 'Parent tag'.
In the following you can see a GFF file example (modified version of [saccharomyces_cerevisiae.gff](http://downloads.yeastgenome.org/curation/chromosomal_feature/saccharomyces_cerevisiae.gff)):
```text
##gff-version 3
# date Mon Feb 7 19:35:06 2005
chrI SGD gene 335 649 . + . ID=YAL069W;Name=YAL069W;Ontology_term=GO:0000004,GO:0005554,GO:0008372;Note=Hypothetical%20ORF;dbxref=SGD:S000002143;orf_classification=Dubious
chrI SGD CDS 335 649 . + 0 Parent=YAL069W;Name=YAL069W;Ontology_term=GO:0000004,GO:0005554,GO:0008372;Note=Hypothetical%20ORF;dbxref=SGD:S000002143;orf_classification=Dubious
###
##FASTA
>chrI
CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAAC
ACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCAT
TCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC
CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATAT
TGAAACGCTAACAAATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCAC
CCTCACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTCAGATTC
CACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACGGCACTTGCCTCAGCGG
TCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATATCTATATCTCATTCGGCGGTCCCAAAT
ATTGTATAACTGCCCTTAATACATACGTTATACCACTTTTGCACCATATACTTACCACTCCATTTATATACACTTATGTC
AATATTACAGAAAAATCCCCACAAAAATCACCTAAACATAAAAATATTCTACTTTTCAACAATAATACATAAACATATTG
```
Directives (marked with "##[...]") provide additional information like the gff-version which has to be the first line of each file ("##gff-version 3[...]").
Comment lines have to start with a single "#[...]". It is possible that sequences in FastA format are attached at the end of the file. This has to be announced by a "##FASTA" directive line.
For further information visit [GFF3-Specifications](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md).
## Reading GFF3 files
To read in a gff you have to insert a filepath and optionally a FastA converter. For further information about FastA check the [FastA section](https://csbiology.github.io/BioFSharp/FastA.html)
or visit [API Reference - FastA](https://csbiology.github.io/BioFSharp/reference/biofsharp-io-fasta.html).
*)
open BioFSharp
open BioFSharp.IO
//path of the input file
let filepathGFF = (__SOURCE_DIRECTORY__ + "/data/gff3Example.gff")
//reads from file to seq of GFFLines
//If no FASTA Sequence is included you directly can use GFF3.fromFileWithoutFasta [filepathGFF].
(***do-not-eval***)
let features = GFF3.fromFile BioFSharp.BioArray.ofNucleotideString filepathGFF
(**
```text
seq
[Directive "##gff-version 3"; Comment "# date Mon Feb 7 19:35:06 2005";
GFFEntryLine
{ Seqid = "chrI"
Source = "SGD"
Feature = "gene"
StartPos = 335
EndPos = 649
Score = nan
Strand = '+'
Phase = -1
Attributes =
map
[("ID", ["YAL069W"]); ("Name", ["YAL069W"]);
("Note", ["Hypothetical%20ORF"]);
("Ontology_term", ["GO:0000004"; "GO:0005554"; "GO:0008372"]);
("dbxref", ["SGD:S000002143"]);
("orf_classification", ["Dubious"])]
Supplement = [|"No supplement"|] };
GFFEntryLine{
...
]
```
*)
(**
## How to use GFF3SanityCheck
The GFF3SanityCheck prints wether your GFF3 file is valid or not. It returns all specified errors including the lines in which they occured.
In contrast to GFF2 the field 3 (type, feature or method) of a GFF3 entry is restricted to terms defined by the sequence ontology (SO) so this validator is able to check if the entry is a valid SO term.
You can find new versions of the SO at (https://sourceforge.net/projects/song/files/SO_Feature_Annotation).
*)
(***do-not-eval***)
//to validate the GFF file without SOTerm verification use this function and only insert the filepath
let featuresSanityCheck = GFF3.sanityCheck filepathGFF
(***do-not-eval***)
//path, name and version of the 'Sequence Ontology terms'-file
let filepathSO_Terms = (__SOURCE_DIRECTORY__ + "/data/Sequence_Ontology_Terms_2_5_3.txt")
(***do-not-eval***)
//to validate the gff file insert filepath
let featuresSanityCheckWithSOTerm = GFF3.sanityCheckWithSOTerm filepathSO_Terms filepathGFF
(**
## How to use GFF3RelationshipSearch
You also can do a simple search for "Parent - child of" relationships giving back all genomic features which contain the searchterm in **ID/Id** or **Parent** field.
*)
///Term to search for:
let searchterm = "YAL069W"
(***do-not-eval***)
///with this function you can search features which are related to the searchterm
let gffExampleSearch = GFF3.relationshipSearch features searchterm
(**
```text
seq
[{ Seqid = "chrI"
Source = "SGD"
Feature = "gene"
StartPos = 335
EndPos = 649
Score = nan
Strand = '+'
Phase = -1
Attributes =
map
[("ID", ["YAL069W"]); ("Name", ["YAL069W"]);
("Note", ["Hypothetical%20ORF"]);
("Ontology_term", ["GO:0000004"; "GO:0005554"; "GO:0008372"]);
("dbxref", ["SGD:S000002143"]); ("orf_classification", ["Dubious"])]
Supplement = [|"No supplement"|] };
{ Seqid = "chrI"
Source = "SGD"
Feature = "CDS"
StartPos = 335
EndPos = 649
Score = nan
Strand = '+'
Phase = 0
Attributes =
map
[("Name", ["YAL069W"]); ("Note", ["Hypothetical%20ORF"]);
("Ontology_term", ["GO:0000004"; "GO:0005554"; "GO:0008372"]);
("Parent", ["YAL069W"]); ("dbxref", ["SGD:S000002143"]);
("orf_classification", ["Dubious"])]
Supplement = [|"No supplement"|] }]
```
*)
(**
##Writing GFF3 files
In order to write a sequence of (GFFLine<_>) into a file use the following function.
If FastA sequences are included they are appended by a FastA writer described in the [API Reference - FastA](https://csbiology.github.io/BioFSharp/reference/biofsharp-io-fasta.html).
_Note: The order of key value pairs in field 9 (attributes) may be changed._
*)
(*** do-not-eval ***)
///Takes a seq<GFF<'a>>, a FASTA converter and a destination filepath and writes it into a .gff. Hint: Use converter = id if no FastA sequence is included.
let gffExampleWrite = GFF3.write BioItem.symbol (__SOURCE_DIRECTORY__ + "/data/gffExampleWrite.gff") features
(**
## Example: Sequence of CDS
If a FastA file is included you can look up the sequence of a CDS feature using the following function.
*)
(***do-not-eval***)
let firstCDS =
//get GFFEntries
let filteredGFFEntries =
features
|> Seq.choose (fun x ->
match x with
| GFF3.GFFEntryLine x -> Some x
| _ -> None)
//get all CDS features
let filteredCDSFeatures =
filteredGFFEntries
|> Seq.filter (fun x -> x.Feature = "CDS")
filteredCDSFeatures |> Seq.head
(***do-not-eval***)
let firstCDSSequence = GFF3.getSequence firstCDS features
(**
```text
seq [A; T; G; A; ...]
```
*)