From c0d09c7ed90762e10c7d06dadb2e7841641b056a Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Fri, 20 Jan 2023 19:12:49 +0100 Subject: [PATCH] add plain and uniprot flavor --- mzio-py/src/fasta/entries/mod.rs | 5 + mzio-py/src/fasta/entries/plain.rs | 65 ++++++ .../fasta/{entry.rs => entries/uniprot.rs} | 49 ++--- mzio-py/src/fasta/mod.rs | 9 +- mzio-py/src/fasta/reader.rs | 37 ---- mzio-py/src/fasta/readers/mod.rs | 5 + mzio-py/src/fasta/readers/plain.rs | 40 ++++ mzio-py/src/fasta/readers/uniprot.rs | 40 ++++ mzio-py/src/fasta/writers/mod.rs | 5 + .../src/fasta/{writer.rs => writers/plain.rs} | 21 +- mzio-py/src/fasta/writers/uniprot.rs | 47 ++++ mzio-py/src/lib.rs | 10 +- mzio-py/tests/test_fasta.py | 93 +++++++- mzio-rs/src/fasta/entry.rs | 86 +++----- mzio-rs/src/fasta/headers/mod.rs | 20 ++ mzio-rs/src/fasta/headers/plain.rs | 44 ++++ mzio-rs/src/fasta/headers/uniprot.rs | 206 ++++++++++++++++++ mzio-rs/src/fasta/mod.rs | 37 +++- mzio-rs/src/fasta/reader.rs | 169 ++------------ mzio-rs/src/fasta/writer.rs | 103 +++------ 20 files changed, 719 insertions(+), 372 deletions(-) create mode 100644 mzio-py/src/fasta/entries/mod.rs create mode 100644 mzio-py/src/fasta/entries/plain.rs rename mzio-py/src/fasta/{entry.rs => entries/uniprot.rs} (56%) delete mode 100644 mzio-py/src/fasta/reader.rs create mode 100644 mzio-py/src/fasta/readers/mod.rs create mode 100755 mzio-py/src/fasta/readers/plain.rs create mode 100644 mzio-py/src/fasta/readers/uniprot.rs create mode 100644 mzio-py/src/fasta/writers/mod.rs rename mzio-py/src/fasta/{writer.rs => writers/plain.rs} (59%) mode change 100644 => 100755 create mode 100644 mzio-py/src/fasta/writers/uniprot.rs mode change 100755 => 100644 mzio-py/src/lib.rs mode change 100755 => 100644 mzio-rs/src/fasta/entry.rs create mode 100755 mzio-rs/src/fasta/headers/mod.rs create mode 100755 mzio-rs/src/fasta/headers/plain.rs create mode 100755 mzio-rs/src/fasta/headers/uniprot.rs mode change 100755 => 100644 
mzio-rs/src/fasta/mod.rs mode change 100755 => 100644 mzio-rs/src/fasta/reader.rs mode change 100755 => 100644 mzio-rs/src/fasta/writer.rs diff --git a/mzio-py/src/fasta/entries/mod.rs b/mzio-py/src/fasta/entries/mod.rs new file mode 100644 index 0000000..74bb7ec --- /dev/null +++ b/mzio-py/src/fasta/entries/mod.rs @@ -0,0 +1,5 @@ +/// Contains wrappers for FASTA headers. +/// + +pub mod plain; +pub mod uniprot; \ No newline at end of file diff --git a/mzio-py/src/fasta/entries/plain.rs b/mzio-py/src/fasta/entries/plain.rs new file mode 100644 index 0000000..cbb536b --- /dev/null +++ b/mzio-py/src/fasta/entries/plain.rs @@ -0,0 +1,65 @@ +// 3rd party imports +use pyo3::prelude::*; +use mzio::fasta::entry::Entry as BaseEntry; +use mzio::fasta::headers::{ + Header, + plain::Plain as BasePlain +}; + +/// Wrapper for the rust implementation entry +/// +#[pyclass] +pub struct Plain { + base_entry: BaseEntry +} + +#[pymethods] +impl Plain { + /// Python constructor + /// + /// # Arguments + /// + #[new] + fn new(header: String, sequence: String) -> Self { + Self { + base_entry: BaseEntry::new( + BasePlain::new(&header), + sequence + ) + } + } + + /// Returns the header + /// + #[getter] + pub fn header(&self) -> PyResult<&str> { + Ok(&self.base_entry.get_header().get_header()) + } + + /// Returns the amino acid sequence + /// + #[getter] + pub fn sequence(&self) -> PyResult<&str> { + Ok(&self.base_entry.get_sequence()) + } +} + + +impl From> for Plain { + /// Convert entry from the Rust implementation to the python wrapper. 
+ /// + /// # Arguments + /// + /// * `base_entry` - Plain from rust implementation + fn from(base_entry: BaseEntry) -> Self { + Self { + base_entry + } + } +} + +impl<'a> Into<&'a BaseEntry> for &'a Plain { + fn into(self) -> &'a BaseEntry { + &self.base_entry + } +} diff --git a/mzio-py/src/fasta/entry.rs b/mzio-py/src/fasta/entries/uniprot.rs similarity index 56% rename from mzio-py/src/fasta/entry.rs rename to mzio-py/src/fasta/entries/uniprot.rs index 01031fc..44cd0f6 100644 --- a/mzio-py/src/fasta/entry.rs +++ b/mzio-py/src/fasta/entries/uniprot.rs @@ -4,37 +4,30 @@ use std::collections::HashMap; // 3rd party imports use pyo3::prelude::*; use mzio::fasta::entry::Entry as BaseEntry; +use mzio::fasta::headers::{ + Header, + uniprot::UniProt as BaseUniProt +}; + /// Wrapper for the rust implementation entry /// #[pyclass] -pub struct Entry { - base_entry: BaseEntry +pub struct UniProt { + base_entry: BaseEntry } #[pymethods] -impl Entry { +impl UniProt { /// Python constructor /// /// # Arguments /// - /// * `database` - The FASTA database - /// * `accession` - Entry accession - /// * `entry_name` - Entry name - /// * `protein_name` - Protein name - /// * `keyword_attributes` - Additional keyword attributes, e.g. 
OX=381666 - /// * `sequence` - Amino acid sequence - /// #[new] - fn new(database: String, accession: String, entry_name: String, protein_name: String, - keyword_attributes: HashMap, sequence: String) -> Self { + fn new(header: String, sequence: String) -> Self { Self { base_entry: BaseEntry::new( - database, - accession, - entry_name, - protein_name, - keyword_attributes, + BaseUniProt::new(&header), sequence ) } @@ -44,28 +37,28 @@ impl Entry { /// #[getter] pub fn database(&self) -> PyResult<&str> { - Ok(&self.base_entry.get_database()) + Ok(&self.base_entry.get_header().get_database()) } /// Returns the accession /// #[getter] pub fn accession(&self) -> PyResult<&str> { - Ok(&self.base_entry.get_accession()) + Ok(&self.base_entry.get_header().get_accession()) } - /// Entry name + /// UniProt name /// #[getter] pub fn entry_name(&self) -> PyResult<&str> { - Ok(&self.base_entry.get_entry_name()) + Ok(&self.base_entry.get_header().get_entry_name()) } /// Returns the protein name /// #[getter] pub fn protein_name(&self) -> PyResult<&str> { - Ok(&self.base_entry.get_protein_name()) + Ok(&self.base_entry.get_header().get_protein_name()) } /// Returns additional keyword attributes, e.g @@ -76,7 +69,7 @@ impl Entry { #[getter] pub fn keyword_attributes(&self) -> PyResult> { // TODO: avoid clone? - Ok(self.base_entry.get_keyword_attributes().clone()) + Ok(self.base_entry.get_header().get_keyword_attributes().clone()) } /// Returns the amino acid sequence @@ -88,21 +81,21 @@ impl Entry { } -impl From for Entry { +impl From> for UniProt { /// Convert entry from the Rust implementation to the python wrapper. 
/// /// # Arguments /// - /// * `base_entry` - Entry from rust implementation - fn from(base_entry: BaseEntry) -> Self { + /// * `base_entry` - UniProt from rust implementation + fn from(base_entry: BaseEntry) -> Self { Self { base_entry } } } -impl<'a> Into<&'a BaseEntry> for &'a Entry { - fn into(self) -> &'a BaseEntry { +impl<'a> Into<&'a BaseEntry> for &'a UniProt { + fn into(self) -> &'a BaseEntry { &self.base_entry } } diff --git a/mzio-py/src/fasta/mod.rs b/mzio-py/src/fasta/mod.rs index 20f5188..b39c834 100644 --- a/mzio-py/src/fasta/mod.rs +++ b/mzio-py/src/fasta/mod.rs @@ -1,3 +1,6 @@ -pub mod entry; -pub mod reader; -pub mod writer; +/// Contains python wrappers for FASTA I/O. +/// + +pub mod entries; +pub mod readers; +pub mod writers; diff --git a/mzio-py/src/fasta/reader.rs b/mzio-py/src/fasta/reader.rs deleted file mode 100644 index 6dad5be..0000000 --- a/mzio-py/src/fasta/reader.rs +++ /dev/null @@ -1,37 +0,0 @@ -// std imports -use std::path::PathBuf; - -// 3rd party modules -use pyo3::prelude::*; -use anyhow::Result; -use mzio::fasta::reader::Reader as BaseReader; - -// internal imports -use crate::fasta::entry::Entry; - -#[pyclass] -pub struct Reader { - base_reader: BaseReader -} - -#[pymethods] -impl Reader { - #[new] - fn new(fasta_file_path: PathBuf, buffer_size: usize) -> Result { - match BaseReader::new(&fasta_file_path, buffer_size) { - Ok(base_reader) => Ok(Self{base_reader}), - Err(err) => Err(err) - } - } - - fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { - slf - } - - fn __next__(mut slf: PyRefMut<'_, Self>) -> Option { - match slf.base_reader.next() { - Some(base_entry) => Some(Entry::from(base_entry)), - None => None - } - } -} diff --git a/mzio-py/src/fasta/readers/mod.rs b/mzio-py/src/fasta/readers/mod.rs new file mode 100644 index 0000000..74bb7ec --- /dev/null +++ b/mzio-py/src/fasta/readers/mod.rs @@ -0,0 +1,5 @@ +/// Contains wrappers for FASTA headers. 
+/// + +pub mod plain; +pub mod uniprot; \ No newline at end of file diff --git a/mzio-py/src/fasta/readers/plain.rs b/mzio-py/src/fasta/readers/plain.rs new file mode 100755 index 0000000..50cdab0 --- /dev/null +++ b/mzio-py/src/fasta/readers/plain.rs @@ -0,0 +1,40 @@ +// std imports +use std::path::PathBuf; + +// 3rd party modules +use pyo3::prelude::*; +use anyhow::{Result}; +use mzio::fasta::headers::plain::Plain as BasePlain; +use mzio::fasta::reader::Reader as BaseReader; + +// internal imports +use crate::fasta::entries::plain::Plain; + + +/// A FASTA reader which returns the header in plain text. +/// +#[pyclass] +pub struct PlainReader { + base_reader: BaseReader +} + +#[pymethods] +impl PlainReader { + #[new] + fn new(fasta_file_path: PathBuf, buffer_size: usize) -> Result { + Ok(Self { + base_reader: BaseReader::::new(&fasta_file_path, buffer_size)? + }) + } + + fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + fn __next__(mut slf: PyRefMut<'_, Self>) -> Option { + match slf.base_reader.next() { + Some(base_entry) => Some(Plain::from(base_entry)), + None => None + } + } +} diff --git a/mzio-py/src/fasta/readers/uniprot.rs b/mzio-py/src/fasta/readers/uniprot.rs new file mode 100644 index 0000000..1e3960b --- /dev/null +++ b/mzio-py/src/fasta/readers/uniprot.rs @@ -0,0 +1,40 @@ +// std imports +use std::path::PathBuf; + +// 3rd party modules +use pyo3::prelude::*; +use anyhow::{Result}; +use mzio::fasta::headers::uniprot::UniProt as BaseUniProt; +use mzio::fasta::reader::Reader as BaseReader; + +// internal imports +use crate::fasta::entries::uniprot::UniProt; + + +/// A FASTA reader which parses the UniProt formatted header format. +/// +#[pyclass] +pub struct UniProtReader { + base_reader: BaseReader +} + +#[pymethods] +impl UniProtReader { + #[new] + fn new(fasta_file_path: PathBuf, buffer_size: usize) -> Result { + Ok(Self { + base_reader: BaseReader::::new(&fasta_file_path, buffer_size)? 
+ }) + } + + fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + fn __next__(mut slf: PyRefMut<'_, Self>) -> Option { + match slf.base_reader.next() { + Some(base_entry) => Some(UniProt::from(base_entry)), + None => None + } + } +} diff --git a/mzio-py/src/fasta/writers/mod.rs b/mzio-py/src/fasta/writers/mod.rs new file mode 100644 index 0000000..2b5c91f --- /dev/null +++ b/mzio-py/src/fasta/writers/mod.rs @@ -0,0 +1,5 @@ +/// Contains FASTA writers. +/// + +pub mod plain; +pub mod uniprot; diff --git a/mzio-py/src/fasta/writer.rs b/mzio-py/src/fasta/writers/plain.rs old mode 100644 new mode 100755 similarity index 59% rename from mzio-py/src/fasta/writer.rs rename to mzio-py/src/fasta/writers/plain.rs index 92cd91c..eb1faca --- a/mzio-py/src/fasta/writer.rs +++ b/mzio-py/src/fasta/writers/plain.rs @@ -4,19 +4,22 @@ use std::path::PathBuf; // 3rd party modules use anyhow::Result; use pyo3::prelude::*; +use mzio::fasta::headers::plain::Plain as BasePlain; use mzio::fasta::writer::Writer as BaseWriter; // internal imports -use crate::fasta::entry::Entry; +use crate::fasta::entries::plain::Plain; +/// A FASTA writer which writes the header in plain text. +/// #[pyclass] -pub struct Writer { - base_writer: BaseWriter +pub struct PlainWriter { + base_writer: BaseWriter } #[pymethods] -impl Writer { +impl PlainWriter { /// Creates a new Writer /// /// # Arguments @@ -25,20 +28,16 @@ impl Writer { /// #[new] pub fn new(fasta_file_path: PathBuf) -> PyResult { - match BaseWriter::new(&fasta_file_path) { - Ok(base_writer) => Ok(Self{base_writer}), - Err(err) => Err(err.into()) - } + Ok(Self { base_writer: BaseWriter::::new(&fasta_file_path)? 
}) } - pub fn write_entry(&mut self, entry: &Entry, sort_keyword_attributes: bool) -> Result { - match self.base_writer.write_entry(entry.into(), sort_keyword_attributes) { + pub fn write_entry(&mut self, entry: &Plain) -> Result { + match self.base_writer.write_entry(entry.into()) { Ok(written_bytes) => Ok(written_bytes), Err(err) => Err(err) } } - pub fn flush(&mut self) -> Result<()> { match self.base_writer.flush() { Ok(_) => Ok(()), diff --git a/mzio-py/src/fasta/writers/uniprot.rs b/mzio-py/src/fasta/writers/uniprot.rs new file mode 100644 index 0000000..3f709cc --- /dev/null +++ b/mzio-py/src/fasta/writers/uniprot.rs @@ -0,0 +1,47 @@ +// std imports +use std::path::PathBuf; + +// 3rd party modules +use anyhow::Result; +use pyo3::prelude::*; +use mzio::fasta::headers::uniprot::UniProt as BaseUniProt; +use mzio::fasta::writer::Writer as BaseWriter; + +// internal imports +use crate::fasta::entries::uniprot::UniProt; + + +/// A FASTA writer which writes the header in UniProt format. +/// +#[pyclass] +pub struct UniProtWriter { + base_writer: BaseWriter +} + +#[pymethods] +impl UniProtWriter { + /// Creates a new Writer + /// + /// # Arguments + /// + /// * `fasta_file_path` - Path to FASTA file + /// + #[new] + pub fn new(fasta_file_path: PathBuf) -> PyResult { + Ok(Self { base_writer: BaseWriter::::new(&fasta_file_path)? 
}) + } + + pub fn write_entry(&mut self, entry: &UniProt) -> Result { + match self.base_writer.write_entry(entry.into()) { + Ok(written_bytes) => Ok(written_bytes), + Err(err) => Err(err) + } + } + + pub fn flush(&mut self) -> Result<()> { + match self.base_writer.flush() { + Ok(_) => Ok(()), + Err(err) => Err(err) + } + } +} \ No newline at end of file diff --git a/mzio-py/src/lib.rs b/mzio-py/src/lib.rs old mode 100755 new mode 100644 index ae30b19..4716a64 --- a/mzio-py/src/lib.rs +++ b/mzio-py/src/lib.rs @@ -19,9 +19,13 @@ fn mzio_py(py: Python, m: &PyModule) -> PyResult<()> { /// `parent_module` - Parent module of the fasta module fn register_fasta_module(py: Python, parent_module: &PyModule) -> PyResult<()> { let child_module = PyModule::new(py, "fasta")?; - child_module.add_class::()?; - child_module.add_class::()?; - child_module.add_class::()?; + child_module.add_class::()?; + child_module.add_class::()?; + child_module.add_class::()?; + child_module.add_class::()?; + child_module.add_class::()?; + child_module.add_class::()?; + // child_module.add_class::()?; parent_module.add_submodule(child_module)?; Ok(()) } diff --git a/mzio-py/tests/test_fasta.py b/mzio-py/tests/test_fasta.py index 3d83147..1b04a6a 100644 --- a/mzio-py/tests/test_fasta.py +++ b/mzio-py/tests/test_fasta.py @@ -1,7 +1,11 @@ +# std imports +from itertools import pairwise from pathlib import Path -from typing import ClassVar +import re +from typing import ClassVar, List, Tuple, Type import unittest +# internal imports from mzio_py import fasta @@ -10,17 +14,22 @@ class FastaModuleTestCase(unittest.TestCase): TEST_WRITE_FASTA_FILE: ClassVar[Path] = Path("../test_files/fasta/mouse.fasta.tmp") TEST_NON_EXISTING_FASTA_FILE: ClassVar[Path] = Path("../test_files/fasta/non_existing.fasta") - def test_read_write(self): - reader = fasta.Reader(self.__class__.TEST_READ_FASTA_FILE, 1024) + READER_WRITER_PAIRS: ClassVar[List[Tuple[Type, Type]]] = [ + (fasta.PlainReader, fasta.PlainWriter), + 
(fasta.UniProtReader, fasta.UniProtWriter) + ] + + def test_read_write_plain(self): + reader = fasta.PlainReader(self.__class__.TEST_READ_FASTA_FILE, 1024) entries = [ entry for entry in reader ] - writer = fasta.Writer(self.__class__.TEST_WRITE_FASTA_FILE) + writer = fasta.PlainWriter(self.__class__.TEST_WRITE_FASTA_FILE) for entry in entries: - writer.write_entry(entry, True) + writer.write_entry(entry) writer.flush() @@ -35,4 +44,76 @@ def test_read_write(self): self.__class__.TEST_WRITE_FASTA_FILE.unlink(missing_ok=True) with self.assertRaises(RuntimeError): - fasta.Reader(self.__class__.TEST_NON_EXISTING_FASTA_FILE, 1024) + fasta.PlainReader(self.__class__.TEST_NON_EXISTING_FASTA_FILE, 1024) + + def __split_keyword_attributes(self, keyword_attributes: str) -> List[str]: + """Simple function to split the keyword attributes of a UniProt header line. + + Parameters + ---------- + keyword_attributes : str + Keyword attributes of FASTA header, e.g. `OS=Zika virus (isolate ZIKV/Human/French Polynesia/10087PF/2013) OX=2043570 PE=1 SV=1` + + Returns + ------- + List[str] + List of keyword attributes, e.g. `["OS=Zika virus (isolate ZIKV/Human/French Polynesia/10087PF/2013)", "OX=2043570", "PE=1", "SV=1"]` + """ + # keep it simple for testing and don't rely on complex regular expressions. + # just find the start of each keyword attribute (`SOMEKEY=`) and slice them. + start_positions: List[int] = [match.start() for match in re.finditer(r"[A-Z]+=", keyword_attributes)] + # add the end of the line + start_positions.append(len(keyword_attributes)) + return [ keyword_attributes[start:end].strip() for (start, end) in pairwise(start_positions) ] + + def test_read_write_uniprot(self): + """ + Tests UniProtReader. + The rust implementation of the UniProt entry will sort the keyword arguments in test mode. + Unfortunately, it is not passed down to the imported `mzio` crate. 
Therefore, the test cannot compare the lines directly + and has to split the keyword attributes and compare them separately. + """ + reader = fasta.UniProtReader(self.__class__.TEST_READ_FASTA_FILE, 1024) + + entries = [ + entry for entry in reader + ] + + writer = fasta.UniProtWriter(self.__class__.TEST_WRITE_FASTA_FILE) + + for entry in entries: + writer.write_entry(entry) + + writer.flush() + + del writer + + with self.__class__.TEST_READ_FASTA_FILE.open("r") as in_file: + with self.__class__.TEST_WRITE_FASTA_FILE.open("r") as out_file: + for in_line, out_line in zip(in_file, out_file): + if not in_line.startswith(">"): + self.assertEqual(in_line, out_line) + else: + # find the first equals sign + first_equals_sign_pos: int = in_line.find("=") + # find the preceding whitespace (up to this point, the header should contain everything but the keyword attributes) + first_whitespace_pos: int = in_line.rfind(" ", 0, first_equals_sign_pos) + # compare the first part of the header + self.assertEqual(in_line[:first_whitespace_pos], out_line[:first_whitespace_pos]) + + # split them + in_keyword_attributes = self.__split_keyword_attributes(in_line[first_whitespace_pos:]) + out_keyword_attributes = self.__split_keyword_attributes(out_line[first_whitespace_pos:]) + + # compare length + self.assertEqual(len(in_keyword_attributes), len(out_keyword_attributes)) + + for out_keyword_attribute in out_keyword_attributes: + self.assertIn(out_keyword_attribute, in_keyword_attributes) + + + self.__class__.TEST_WRITE_FASTA_FILE.unlink(missing_ok=True) + + with self.assertRaises(RuntimeError): + fasta.UniProtReader(self.__class__.TEST_NON_EXISTING_FASTA_FILE, 1024) + diff --git a/mzio-rs/src/fasta/entry.rs b/mzio-rs/src/fasta/entry.rs old mode 100755 new mode 100644 index de4230f..787d81b --- a/mzio-rs/src/fasta/entry.rs +++ b/mzio-rs/src/fasta/entry.rs @@ -1,73 +1,39 @@ -use std::collections::HashMap; +// std imports +use std::fmt::Display; -/// Keeps all information of FASTA entry 
-pub struct Entry { - database: String, - accession: String, - entry_name: String, - protein_name: String, - keyword_attributes: HashMap, +// internal imports +use crate::fasta::headers::Header; + +/// Keeps all information of a FASTA entry +/// +pub struct Entry where T: Header { + header: T, sequence: String } -impl Entry { +impl Entry where T: Header + Display { /// Creates a new FASTA entry /// # Arguments /// - /// * `database` - The FASTA database - /// * `accession` - Entry accession - /// * `entry_name` - Entry name - /// * `protein_name` - Protein name - /// * `keyword_attributes` - Additional keyword attributes, e.g. OX=381666 + /// * `header` - Header /// * `sequence` - Amino acid sequence /// - pub fn new(database: String, accession: String, entry_name: String, protein_name: String, - keyword_attributes: HashMap, sequence: String) -> Self { - Self { - database, - accession, - entry_name, - protein_name, - keyword_attributes, - sequence - } - } - - /// Returns the database type - /// - pub fn get_database(&self) -> &str { - &self.database.as_str() - } - - /// Returns the accession - /// - pub fn get_accession(&self) -> &str { - &self.accession - } - - /// Entry name - /// - pub fn get_entry_name(&self) -> &str { - &self.entry_name + pub fn new(header: T, sequence: String) -> Self { + Self { + header, + sequence } + } - /// Returns the protein name - /// - pub fn get_protein_name(&self) -> &str { - &self.protein_name - } - - /// Returns additional keyword attributes, e.g - /// * OX = 381666 - /// * GN = acoX - /// - pub fn get_keyword_attributes(&self) -> &HashMap { - &self.keyword_attributes - } + /// Returns the amino acid sequence of the FASTA entry + /// + pub fn get_sequence(&self) -> &str { + &self.sequence + } - /// Returns the amino acid sequence - /// - pub fn get_sequence(&self) -> &str { - &self.sequence - } + /// Returns the header of the FASTA entry + /// + pub fn get_header(&self) -> &T { + &self.header + } } \ No newline at end of file diff 
--git a/mzio-rs/src/fasta/headers/mod.rs b/mzio-rs/src/fasta/headers/mod.rs new file mode 100755 index 0000000..b9e8a33 --- /dev/null +++ b/mzio-rs/src/fasta/headers/mod.rs @@ -0,0 +1,20 @@ +/// Contains headers for parsing different FASTA headers of different formats. + +// std imports +use std::fmt; + +// publish modules +pub mod plain; +pub mod uniprot; + +/// Trait for defining a header. +/// +/// Display trait should rebuild the header in its original form. +/// +pub trait Header: fmt::Display { + /// Create header from FASTA header + /// + /// # Arguments + /// * `header` - FASTA header + fn new(header: &str) -> Self; +} diff --git a/mzio-rs/src/fasta/headers/plain.rs b/mzio-rs/src/fasta/headers/plain.rs new file mode 100755 index 0000000..de022ac --- /dev/null +++ b/mzio-rs/src/fasta/headers/plain.rs @@ -0,0 +1,44 @@ +// std imports +use std::fmt; + +// local imports +use crate::fasta::headers::Header; + +/// A plain header, which is just a string. +/// +pub struct Plain { + header: String +} + +impl Plain { + pub fn get_header(&self) -> &str { + &self.header + } +} + +impl Header for Plain { + fn new(header: &str) -> Self { + Self { + header: header.to_owned() + } + } +} + +impl fmt::Display for Plain { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.header) + } +} + +#[cfg(test)] +mod test { + use super::*; + + const TEST_HEADER: &'static str = ">sp|P27748|ACOX_CUPNH Acetoin catabolism protein X OS=Cupriavidus necator (strain ATCC 17699 / H16 / DSM 428 / Stanier 337) OX=381666 GN=acoX PE=4 SV=2"; + + #[test] + fn test_display() { + let plain = Plain::new(TEST_HEADER); + assert_eq!(plain.to_string(), TEST_HEADER.to_owned()); + } +} \ No newline at end of file diff --git a/mzio-rs/src/fasta/headers/uniprot.rs b/mzio-rs/src/fasta/headers/uniprot.rs new file mode 100755 index 0000000..cd5ec2a --- /dev/null +++ b/mzio-rs/src/fasta/headers/uniprot.rs @@ -0,0 +1,206 @@ +// std imports +use std::collections::HashMap; +use 
std::fmt; + +// 3rd party imports +use crate::fasta::headers::Header; + +/// A parsed UniProt header. +/// Keeps the database, accession, entry name, protein name and keyword attributes. +/// +pub struct UniProt { + database: String, + accession: String, + entry_name: String, + protein_name: String, + keyword_attributes: HashMap +} + +impl UniProt { + /// Returns the database of the UniProt header. + /// + pub fn get_database(&self) -> &str { + &self.database + } + + /// Returns the accession of the UniProt header. + /// + pub fn get_accession(&self) -> &str { + &self.accession + } + + /// Returns the entry name of the UniProt header. + /// + pub fn get_entry_name(&self) -> &str { + &self.entry_name + } + + /// Returns the protein name of the UniProt header. + /// + pub fn get_protein_name(&self) -> &str { + &self.protein_name + } + + /// Returns the keyword attributes of the UniProt header, like gene (GN), organism (OS), etc. + /// + pub fn get_keyword_attributes(&self) -> &HashMap { + &self.keyword_attributes + } + + /// Processes and adds a keyword attribute to the HashMap + /// # Arguments + /// + /// * `raw_attr` - Raw attributes, e.g. `key=value with spaces` + /// * `keyword_attributes` - Additional keyword attributes + /// + fn prep_and_add_attribute_to_keyword_attributes(raw_attr: &str, keyword_attributes: &mut HashMap) { + let attr_split = raw_attr.split("=").collect::>(); + if let Some(key) = attr_split.get(0) { + if let Some(value) = attr_split.get(1) { + keyword_attributes.insert( + key.to_string(), + value.to_string() + ); + } + } + } + + /// Creates a UniProt header of the given header. 
+ /// + /// # Arguments + /// + /// * `header` - A FASTA header + /// + fn internal_new(header: &str) -> Self { + // Split by '|' and extract database and accession + let mut header_split = header.split("|").collect::>(); + let mut database: String = header_split.remove(0).to_string(); + database = database.as_str()[1..].to_string(); // remove '>' + let accession: String = header_split.remove(0).to_string(); + + // Split by ' ' + header_split = header_split.remove(0).split(" ").collect::>(); + + // Extract entry name + let entry_name: String = header_split.remove(0).to_string(); + // Add chunks to protein name until first string with '=' occurs (begin of keyword attributes) + let mut protein_name: String = header_split.remove(0).to_string(); + loop { + if let Some(chunk) = header_split.get(0) { + if !chunk.contains("=") { + protein_name.push_str(" "); + protein_name.push_str(header_split.remove(0)); + } else { + break + } + } + } + // Extract keyword attributes + let mut keyword_attributes: HashMap = HashMap::new(); + if header_split.len() > 0 { + let mut current_attr: String = String::new(); + while header_split.len() > 0 { + if let Some(chunk) = header_split.get(0) { + // Every time a chunk does not start a new attribute (chunk does not contain '=') + // add the chunk to the current attribute, otherwise process the current attribute + // and begin a new one + if !chunk.contains("=") { + current_attr.push_str(" "); + current_attr.push_str(header_split.remove(0)); + } else { + Self::prep_and_add_attribute_to_keyword_attributes( + &current_attr, + &mut keyword_attributes + ); + if header_split.len() > 0 { + current_attr = header_split.remove(0).to_string(); + } + } + } + } + // Process the remaining attribute + Self::prep_and_add_attribute_to_keyword_attributes( + &current_attr, + &mut keyword_attributes + ); + } + Self { + database, // database + accession, // accession + entry_name, + protein_name, + keyword_attributes + } + } + + /// Converts the keyword_attributes to a 
String, format + /// `key=value key=value` + /// + /// **Attention!** In test mode it will sort the attributes to make sure the output is always the same. + /// + fn keyword_attributes_to_string(&self) -> String { + if self.get_keyword_attributes().len() > 0 { + #[allow(unused_mut)] // needs to be mutable in test mode for sorting + let mut keyword_arguments: Vec = self.get_keyword_attributes().into_iter() + .map(|(key, value)| format!("{}={}", key, value)).collect(); + // Only sort in test mode to have predictable output (HashMaps are not ordered). + #[cfg(test)] + { + keyword_arguments.sort(); + } + return keyword_arguments.join(" ").to_owned(); + } + return String::new(); + } +} + +impl Header for UniProt { + fn new(header: &str) -> Self { + Self::internal_new(header) + } +} + +impl fmt::Display for UniProt { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, ">{}|{}|{} {} {}", self.get_database(), self.get_accession(), self.get_entry_name(), self.get_protein_name(), self.keyword_attributes_to_string()) + } +} + + +#[cfg(test)] +mod test { + use super::*; + + const TEST_HEADER: &'static str = ">sp|P27748|ACOX_CUPNH Acetoin catabolism protein X OS=Cupriavidus necator (strain ATCC 17699 / H16 / DSM 428 / Stanier 337) OX=381666 GN=acoX PE=4 SV=2"; + const EXPECTED_DATABASE: &'static str = "sp"; + const EXPECTED_ACCESSION: &'static str = "P27748"; + const EXPECTED_ENTRY_NAME: &'static str = "ACOX_CUPNH"; + const EXPECTED_PROTEIN_NAME: &'static str = "Acetoin catabolism protein X"; + const EXPECTED_KEYWORD_ATTRIBUTES: [(&'static str, &'static str,); 5] = [ + ("OS", "Cupriavidus necator (strain ATCC 17699 / H16 / DSM 428 / Stanier 337)"), + ("OX", "381666"), + ("GN", "acoX"), + ("PE", "4"), + ("SV", "2") + ]; + const EXPECTED_HEADER: &'static str = ">sp|P27748|ACOX_CUPNH Acetoin catabolism protein X GN=acoX OS=Cupriavidus necator (strain ATCC 17699 / H16 / DSM 428 / Stanier 337) OX=381666 PE=4 SV=2"; + + #[test] + fn test_creation() { + let uniprot = 
UniProt::new(TEST_HEADER); + assert_eq!(uniprot.get_accession(), EXPECTED_ACCESSION); + assert_eq!(uniprot.get_database(), EXPECTED_DATABASE); + assert_eq!(uniprot.get_entry_name(), EXPECTED_ENTRY_NAME); + assert_eq!(uniprot.get_protein_name(), EXPECTED_PROTEIN_NAME); + for (key, value) in EXPECTED_KEYWORD_ATTRIBUTES.iter() { + let key_value = uniprot.get_keyword_attributes().get_key_value(*key); + assert_eq!(key_value, Some((&(*key).to_owned(), &(*value).to_owned()))); + } + } + + #[test] + fn test_display() { + let uniprot = UniProt::new(TEST_HEADER); + assert_eq!(uniprot.to_string(), EXPECTED_HEADER.to_owned()); + } +} \ No newline at end of file diff --git a/mzio-rs/src/fasta/mod.rs b/mzio-rs/src/fasta/mod.rs old mode 100755 new mode 100644 index 561dbd7..ca8d67e --- a/mzio-rs/src/fasta/mod.rs +++ b/mzio-rs/src/fasta/mod.rs @@ -1,4 +1,6 @@ +/// Contains FASTA I/O. pub mod entry; +pub mod headers; pub mod reader; pub mod writer; @@ -6,33 +8,60 @@ pub mod writer; mod test { use super::*; + use std::fmt::Display; use std::fs; use std::path::Path; + use crate::fasta::headers::{ + Header, + plain::Plain, + uniprot::UniProt + }; + + const FASTA_FILE_PATH_STR: &'static str = "./test_files/fasta/partial_mouse.fasta"; const EXPECTED_NUM_PROTEINS: usize = 10; const TEMP_FASTA_PATH_STR: &'static str = "./test_files/fasta/partial_mouse.fasta.tmp"; + + /// Test reading and writing of a FASTA without parsing the header + /// #[test] + fn test_plain_header() { + test_reading_and_writing::("plain"); + } + + /// Test reading and writing header while parsing the headers UniProt style + /// + #[test] + fn test_uniprot_header() { + test_reading_and_writing::("uniprot"); + } + /// Reads a FASTA file, parses the proteins, /// write them back into a temporary file and compares it with the original one. 
- fn test_reading_and_writing() { + /// + /// # Arguments + /// `tmp_file_suffix` - Suffix for the temporary file + /// + fn test_reading_and_writing(tmp_file_suffix: &'static str) where T: Header + Display { let fasta_file_path = Path::new(FASTA_FILE_PATH_STR); - let tmp_fasta_file_path = Path::new(TEMP_FASTA_PATH_STR); + let tmp_file_path_str = format!("{}.{}", TEMP_FASTA_PATH_STR, tmp_file_suffix); + let tmp_fasta_file_path = Path::new(&tmp_file_path_str); let reader = reader::Reader::new( fasta_file_path, 1024 ).unwrap(); - let entries: Vec = reader.into_iter().collect(); + let entries: Vec> = reader.into_iter().collect(); assert_eq!(entries.len(), EXPECTED_NUM_PROTEINS); let mut writer = writer::Writer::new( tmp_fasta_file_path ).unwrap(); - writer.write_all(entries.iter(), true).unwrap(); + writer.write_all(entries.iter()).unwrap(); writer.flush().unwrap(); let tmp_fasta_content = fs::read_to_string(tmp_fasta_file_path).unwrap().trim().to_string(); diff --git a/mzio-rs/src/fasta/reader.rs b/mzio-rs/src/fasta/reader.rs old mode 100755 new mode 100644 index 5ddc133..e4291d9 --- a/mzio-rs/src/fasta/reader.rs +++ b/mzio-rs/src/fasta/reader.rs @@ -1,21 +1,27 @@ -use std::collections::HashMap; +// std imports use std::fs::File; use std::io::BufReader; use std::io::prelude::*; use std::path::Path; -use crate::fasta::entry::Entry; +// 3rd party imports use anyhow::Result; -/// Reader for common FASTA files as distributed by UniProt (https://uniprot.org) -pub struct Reader { +// internal imports +use crate::fasta::entry::Entry; +use crate::fasta::headers::Header; + + +/// Reader for common FASTA files as distributed by e.g. 
UniProt (https://uniprot.org) +pub struct Reader where T: Header { internal_reader: BufReader, is_eof: bool, header: String, - sequence: String + sequence: String, + _header_phantom: std::marker::PhantomData } -impl Reader { +impl Reader where T: Header { /// Creates a new Reader /// # Arguments /// @@ -27,102 +33,24 @@ impl Reader { internal_reader: BufReader::with_capacity(buffer_size, fasta_file), is_eof: false, header: String::new(), - sequence: String::new() + sequence: String::new(), + _header_phantom: std::marker::PhantomData }) } - /// Processes and adds a keyword attribute to the HashMap - /// # Arguments - /// - /// * `raw_attr` - Raw attributes, e.g. `key=value with spaces` - /// * `keyword_attributes` - Additional keyword attributes - /// - fn prep_and_add_attribute_to_keyword_attributes(raw_attr: &str, keyword_attributes: &mut HashMap) { - let attr_split = raw_attr.split("=").collect::>(); - if let Some(key) = attr_split.get(0) { - if let Some(value) = attr_split.get(1) { - keyword_attributes.insert( - key.to_string(), - value.to_string() - ); - } - } - } - - /// Creates a new Entry from the given header and sequence. 
- /// - /// # Arguments - /// - /// * `header` - A FASTA header - /// * `sequence` - Amino acid sequence - /// - pub fn create_entry(header: &str, sequence: &str) -> Option { - // Split by '|' and extract database and accession - let mut header_split = header.split("|").collect::>(); - let mut database: String = header_split.remove(0).to_string(); - database = database.as_str()[1..].to_string(); // remove '>' - let accession: String = header_split.remove(0).to_string(); - - // Split by ' ' - header_split = header_split.remove(0).split(" ").collect::>(); - - // Extract entry name - let entry_name: String = header_split.remove(0).to_string(); - // Add chunks to protein name until first string with '=' occurs (begin of keyword attributes) - let mut protein_name: String = header_split.remove(0).to_string(); - loop { - if let Some(chunk) = header_split.get(0) { - if !chunk.contains("=") { - protein_name.push_str(" "); - protein_name.push_str(header_split.remove(0)); - } else { - break - } - } - } - // Extract keyword attributes - let mut keyword_attributes: HashMap = HashMap::new(); - if header_split.len() > 0 { - let mut current_attr: String = String::new(); - while header_split.len() > 0 { - if let Some(chunk) = header_split.get(0) { - // Every time a chunk does not start a new attribute (chunk does not contains '=') - // add the chunk to the current attribute, otherwise process the current attribute - // and begin a new one - if !chunk.contains("=") { - current_attr.push_str(" "); - current_attr.push_str(header_split.remove(0)); - } else { - Reader::prep_and_add_attribute_to_keyword_attributes( - ¤t_attr, - &mut keyword_attributes - ); - if header_split.len() > 0 { - current_attr = header_split.remove(0).to_string(); - } - } - } - } - // Process the remaining attribute - Reader::prep_and_add_attribute_to_keyword_attributes( - ¤t_attr, - &mut keyword_attributes - ); - } - return Some(Entry::new( - database, // database - accession, // accession - entry_name, - 
protein_name, - keyword_attributes, - sequence.replace("\n", "") - )); + pub fn create_entry(header: &str, sequence: &str) -> Option> { + Some( + Entry::new( + T::new(header), + sequence.to_owned() + ) + ) } } -impl Iterator for Reader { - type Item = Entry; +impl Iterator for Reader where T: Header { + type Item = Entry; fn next(&mut self) -> Option { if self.is_eof { @@ -153,54 +81,3 @@ impl Iterator for Reader { } } - - -#[cfg(test)] -mod test { - use super::*; - - const TEST_HEADER: &'static str = ">sp|P27748|ACOX_CUPNH Acetoin catabolism protein X OS=Cupriavidus necator (strain ATCC 17699 / H16 / DSM 428 / Stanier 337) OX=381666 GN=acoX PE=4 SV=2"; - const TEST_SEQUENCE: &'static str = "MGHAAGASAQIAPVVGIIANPISARDIRRVIANANSLQLADRVNIVLRLLAALASCGVER -VLMMPDREGLRVMLARHLARRQGPDSGLPAVDYLDMPVTARVDDTLRAARCMADAGVAAI -IVLGGDGTHRAVVRECGAVPIAGLSTGTNNAYPEMREPTIIGLATGLYATGRIPPAQALA -SNKRLDIVIRDGNGGFRRDIALVDAVISHEHFIGARALWKTDTLAAVYVSFADPEAIGLS -SIAGLLEPVGRREEGGLAIELAAPGEGEFDLCAPIAPGLMCTVPVAGWQRLEHGRPHRVR -QRSGIVALDGERELAFGPDDEVTVTLHDHAFRSIDVAACMRHAGRHHLMRSLPQPAAVG"; - const EXPECTED_DATABASE: &'static str = "sp"; - const EXPECTED_ACCESSION: &'static str = "P27748"; - const EXPECTED_ENTRY_NAME: &'static str = "ACOX_CUPNH"; - const EXPECTED_PROTEIN_NAME: &'static str = "Acetoin catabolism protein X"; - const EXPECTED_KEYWORD_ATTRIBUTES: [(&'static str, &'static str,); 5] = [ - ("OS", "Cupriavidus necator (strain ATCC 17699 / H16 / DSM 428 / Stanier 337)"), - ("OX", "381666"), - ("GN", "acoX"), - ("PE", "4"), - ("SV", "2") - ]; - const EXPECTED_SEQUENCE: &'static str = "MGHAAGASAQIAPVVGIIANPISARDIRRVIANANSLQLADRVNIVLRLLAALASCGVER\ - VLMMPDREGLRVMLARHLARRQGPDSGLPAVDYLDMPVTARVDDTLRAARCMADAGVAAI\ - IVLGGDGTHRAVVRECGAVPIAGLSTGTNNAYPEMREPTIIGLATGLYATGRIPPAQALA\ - SNKRLDIVIRDGNGGFRRDIALVDAVISHEHFIGARALWKTDTLAAVYVSFADPEAIGLS\ - SIAGLLEPVGRREEGGLAIELAAPGEGEFDLCAPIAPGLMCTVPVAGWQRLEHGRPHRVR\ - QRSGIVALDGERELAFGPDDEVTVTLHDHAFRSIDVAACMRHAGRHHLMRSLPQPAAVG"; - - #[test] - /// Tests the 
creation of a FASTA entry from a header and a sequence. - /// - fn test_entry_creation() { - let entry = Reader::create_entry(TEST_HEADER, TEST_SEQUENCE).unwrap(); - assert_eq!(entry.get_database(), EXPECTED_DATABASE); - assert_eq!(entry.get_accession(), EXPECTED_ACCESSION); - assert_eq!(entry.get_entry_name(), EXPECTED_ENTRY_NAME); - assert_eq!(entry.get_protein_name(), EXPECTED_PROTEIN_NAME); - assert_eq!(entry.get_sequence(), EXPECTED_SEQUENCE); - - for key_value in EXPECTED_KEYWORD_ATTRIBUTES { - assert!(entry.get_keyword_attributes().contains_key(key_value.0)); - assert_eq!( - entry.get_keyword_attributes().get(key_value.0).unwrap(), - key_value.1 - ) - } - } -} diff --git a/mzio-rs/src/fasta/writer.rs b/mzio-rs/src/fasta/writer.rs old mode 100755 new mode 100644 index 02bb567..f0fc2ae --- a/mzio-rs/src/fasta/writer.rs +++ b/mzio-rs/src/fasta/writer.rs @@ -1,21 +1,28 @@ +// std imports use std::fs::File; use std::io::BufWriter; use std::io::prelude::*; use std::path::Path; -use crate::fasta::entry::Entry; +// 3rd party imports use anyhow::Result; +// internal imports +use crate::fasta::entry::Entry; +use crate::fasta::headers::Header; + + /// Max amino acids per sequence line. const MAX_AMINO_ACIDS_PER_LINE: usize = 60; /// Writer for common FASTA files as distributed by UniProt (https://uniprot.org) -/// Use flush() to mak ensure the buffer is written completely. -pub struct Writer { - internal_writer: BufWriter +/// Use flush() to make sure the buffer is written completely. 
+pub struct Writer where T: Header { + internal_writer: BufWriter, + _header_phantom: std::marker::PhantomData } -impl Writer { +impl<'a, T> Writer where T: Header + 'a{ /// Creates a new Writer /// /// # Arguments @@ -25,39 +32,11 @@ impl Writer { pub fn new(fasta_file_path: &Path) -> Result { let fasta_file: File = File::create(fasta_file_path)?; Ok(Self { - internal_writer: BufWriter::new(fasta_file) + internal_writer: BufWriter::new(fasta_file), + _header_phantom: std::marker::PhantomData }) } - /// Creates a fasta header from the given entry. - /// - /// # Arguments - /// - /// * `entry` - FASTA entry - /// * `sort_keyword_attributes` - If true the keyword attributes will be sorted (for testing and readability reasons) - /// - fn create_header(entry: &Entry, sort_keyword_attributes: bool) -> String { - let mut header = ">".to_string(); - header.push_str(entry.get_database()); - header.push_str("|"); - header.push_str(entry.get_accession()); - header.push_str("|"); - header.push_str(entry.get_entry_name()); - header.push_str(" "); - header.push_str(entry.get_protein_name()); - if entry.get_keyword_attributes().len() > 0 { - header.push_str(" "); - let mut keyword_arguments: Vec = entry.get_keyword_attributes().into_iter() - .map(|(key, value)| format!("{}={}", key, value)).collect(); - if sort_keyword_attributes { - keyword_arguments.sort(); - - } - header.push_str(&keyword_arguments.join(" ")); - } - return header; - } - /// Splits sequence into chunk of MAX_AMINO_ACIDS_PER_LINE. 
/// /// # Arguments @@ -78,14 +57,13 @@ impl Writer { /// # Arguments /// /// * `entry` - FASTA entry - /// * `sort_keyword_attributes` - If true the keyword attributes will be sorted (for testing and readability reasons) /// - pub fn write_entry(&mut self, entry: &Entry, sort_keyword_attributes: bool) -> Result { + pub fn write_entry(&mut self, entry: &Entry) -> Result { let mut written_bytes: usize = 0; - written_bytes += self.internal_writer.write(Self::create_header(entry, sort_keyword_attributes).as_bytes())?; - written_bytes += self.internal_writer.write(b"\n")?; - written_bytes += self.internal_writer.write(Self::format_sequence(entry.get_sequence()).as_bytes())?; - written_bytes += self.internal_writer.write(b"\n")?; + written_bytes += self.internal_writer.write(format!( + "{}\n{}\n", entry.get_header(), + Self::format_sequence(entry.get_sequence()) + ).as_bytes())?; return Ok(written_bytes); } @@ -96,13 +74,13 @@ impl Writer { /// * `entires` - Iterator of FASTA entries /// * `sort_keyword_attributes` - If true the keyword attributes will be sorted (for testing and readability reasons) /// - pub fn write_all<'b, I>(&mut self, entries: I, sort_keyword_attributes: bool) -> Result + pub fn write_all<'b, I>(&mut self, entries: I) -> Result where - I: Iterator, + I: Iterator>, { let mut written_bytes: usize = 0; for entry in entries { - written_bytes += self.write_entry(entry, sort_keyword_attributes)?; + written_bytes += self.write_entry(entry)?; } return Ok(written_bytes); } @@ -117,28 +95,19 @@ impl Writer { #[cfg(test)] mod test { - use std::collections::HashMap; - use super::*; + use crate::fasta::headers::{ + plain::Plain + }; + const TEST_SEQUENCE: &'static str = "MGHAAGASAQIAPVVGIIANPISARDIRRVIANANSLQLADRVNIVLRLLAALASCGVER\ VLMMPDREGLRVMLARHLARRQGPDSGLPAVDYLDMPVTARVDDTLRAARCMADAGVAAI\ IVLGGDGTHRAVVRECGAVPIAGLSTGTNNAYPEMREPTIIGLATGLYATGRIPPAQALA\ SNKRLDIVIRDGNGGFRRDIALVDAVISHEHFIGARALWKTDTLAAVYVSFADPEAIGLS\ 
SIAGLLEPVGRREEGGLAIELAAPGEGEFDLCAPIAPGLMCTVPVAGWQRLEHGRPHRVR\ QRSGIVALDGERELAFGPDDEVTVTLHDHAFRSIDVAACMRHAGRHHLMRSLPQPAAVG"; - const TEST_DATABASE: &'static str = "sp"; - const TEST_ACCESSION: &'static str = "P27748"; - const TEST_ENTRY_NAME: &'static str = "ACOX_CUPNH"; - const TEST_PROTEIN_NAME: &'static str = "Acetoin catabolism protein X"; - const TEST_KEYWORD_ATTRIBUTES: [(&'static str, &'static str,); 5] = [ - ("OS", "Cupriavidus necator (strain ATCC 17699 / H16 / DSM 428 / Stanier 337)"), - ("OX", "381666"), - ("GN", "acoX"), - ("PE", "4"), - ("SV", "2") - ]; - const EXPECTED_HEADER: &'static str = ">sp|P27748|ACOX_CUPNH Acetoin catabolism protein X GN=acoX OS=Cupriavidus necator (strain ATCC 17699 / H16 / DSM 428 / Stanier 337) OX=381666 PE=4 SV=2"; + const EXPECTED_SEQUENCE: &'static str = "MGHAAGASAQIAPVVGIIANPISARDIRRVIANANSLQLADRVNIVLRLLAALASCGVER VLMMPDREGLRVMLARHLARRQGPDSGLPAVDYLDMPVTARVDDTLRAARCMADAGVAAI IVLGGDGTHRAVVRECGAVPIAGLSTGTNNAYPEMREPTIIGLATGLYATGRIPPAQALA @@ -148,24 +117,10 @@ QRSGIVALDGERELAFGPDDEVTVTLHDHAFRSIDVAACMRHAGRHHLMRSLPQPAAVG"; #[test] /// Tests the creation of a FASTA entry from a header and a sequence. + /// Same for ech header header. /// fn test_seqeunce_formatting() { - let formatted_sequence = Writer::format_sequence(TEST_SEQUENCE); + let formatted_sequence = Writer::::format_sequence(TEST_SEQUENCE); assert_eq!(formatted_sequence, EXPECTED_SEQUENCE) } - - #[test] - /// Reads a FASTA file, parses the proteins and counts protein - fn test_header_creation() { - let entry = Entry::new( - TEST_DATABASE.to_string(), - TEST_ACCESSION.to_string(), - TEST_ENTRY_NAME.to_string(), - TEST_PROTEIN_NAME.to_string(), - TEST_KEYWORD_ATTRIBUTES.into_iter().map(|elem| (elem.0.to_string(), elem.1.to_string())).collect::>(), - TEST_SEQUENCE.to_string() - ); - let header = Writer::create_header(&entry, true); - assert_eq!(header, EXPECTED_HEADER); - } }