forked from BioFSharp/BioFSharp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path03_02_fastq_parsing.fsx
More file actions
87 lines (70 loc) · 2.77 KB
/
03_02_fastq_parsing.fsx
File metadata and controls
87 lines (70 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
(**
---
title: Fastq
category: BioParsers
categoryindex: 3
index: 1
---
*)
(*** hide ***)
(*** condition: prepare ***)
#r "nuget: FSharpAux, 1.1.0"
#r "nuget: FSharpAux.IO, 1.1.0"
#r "nuget: FSharp.Stats, 0.4.3"
#r "nuget: Plotly.NET, 2.0.0-preview.18"
#r "../src/BioFSharp/bin/Release/netstandard2.0/BioFSharp.dll"
#r "../src/BioFSharp.IO/bin/Release/netstandard2.0/BioFSharp.IO.dll"
#r "../src/BioFSharp.BioContainers/bin/Release/netstandard2.0/BioFSharp.BioContainers.dll"
#r "../src/BioFSharp.ML/bin/Release/netstandard2.0/BioFSharp.ML.dll"
#r "../src/BioFSharp.Stats/bin/Release/netstandard2.0/BioFSharp.Stats.dll"
(*** condition: ipynb ***)
#if IPYNB
#r "nuget: FSharpAux, 1.1.0"
#r "nuget: FSharpAux.IO, 1.1.0"
#r "nuget: FSharp.Stats, 0.4.3"
#r "nuget: Plotly.NET, 2.0.0-preview.18"
#r "nuget: Plotly.NET.Interactive, 2.0.0-preview.18"
#r "nuget: BioFSharp, {{fsdocs-package-version}}"
#r "nuget: BioFSharp.IO, {{fsdocs-package-version}}"
#r "nuget: BioFSharp.BioContainers, {{fsdocs-package-version}}"
#r "nuget: BioFSharp.ML, {{fsdocs-package-version}}"
#r "nuget: BioFSharp.Stats, {{fsdocs-package-version}}"
#endif // IPYNB
(**
# Fastq parsing
[Binder](https://mybinder.org/v2/gh/CSBiology/BioFSharp/gh-pages?filepath={{fsdocs-source-basename}}.ipynb) 
[Script]({{root}}{{fsdocs-source-basename}}.fsx) 
[Notebook]({{root}}{{fsdocs-source-basename}}.ipynb)
*Summary:* This example shows how to parse and write FASTQ-formatted files with BioFSharp.

This module allows you to parse FASTQ data with the original 4-line entries into this record type:
*)
/// One FASTQ entry: header, sequence, quality header and quality sequence.
/// 'a is the type used for the sequence, 'b the type used for the quality sequence.
type FastqItem<'a,'b> =
    {
        /// Header of the entry.
        Header          : string
        /// Sequence of the entry, in the representation chosen by the caller.
        Sequence        : 'a
        /// Quality header of the entry.
        QualityHeader   : string
        /// Quality sequence of the entry, in the representation chosen by the caller.
        QualitySequence : 'b
    }
(**
To use this parser you need to define two converter functions: one for the sequence and one for the quality sequence.
An example of each can be found in our module, but you may also need to write your own.
If your quality sequence can contain the following values:
```txt
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
```
in the Sanger format, which encodes a Phred quality score from 0 to 93 using ASCII characters 33 to 126,
then you can use our conversion function:
*)
/// Converts a quality string into Phred quality scores.
/// Each character is mapped to its character code minus 33, so the result
/// holds one int per input character, in input order.
let qualityConvertFn (string:string) =
    // Offset subtracted from every character code.
    let phredOffset = 33
    Array.init string.Length (fun pos -> int string.[pos] - phredOffset)
(**
You can then use this module to read your FASTQ file:
*)
open BioFSharp
open BioFSharp.IO
/// Path of the example FASTQ file, built relative to this script's directory.
let yourFastqFile = __SOURCE_DIRECTORY__ + "/data/FastQtest.fastq"

/// Entries read from the example file: sequences are converted with
/// BioArray.ofAminoAcidString and quality strings with qualityConvertFn.
// NOTE(review): FASTQ files usually hold nucleotide reads; confirm that an
// amino-acid converter is intended for this example file.
let FastQSequence =
    yourFastqFile
    |> FastQ.fromFile BioArray.ofAminoAcidString qualityConvertFn
(***include-value:FastQSequence***)