Skip to content

Commit 96885a2

Browse files
authored
Merge pull request #90 from nf-core/add-metagroot-domain-annot
Add metagroot domain annot
2 parents e4da269 + 972f149 commit 96885a2

19 files changed

Lines changed: 261 additions & 52 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
### `Added`
99

10+
- [#90](https://github.com/nf-core/proteinannotator/pull/90) - Added the option to download and use the latest `metagRoot` HMM library (or use path to an existing one) for domain annotation. (by @angelphanth)
1011
- [#87](https://github.com/nf-core/proteinannotator/pull/87) - Added the option to download and use the latest `NMPFams` HMM library (or use path to an existing one) for domain annotation. (by @npechl)
1112
- [#85](https://github.com/nf-core/proteinannotator/pull/85) - Added zenodo doi in `nextflow.config`. (by @vagkaratzas)
1213

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ Generate input amino acid sequence statistics with ([`SeqFu`](https://github.com
3737
### Annotate sequences
3838

3939
1. Conserved domain annotation with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/)) against databases
40-
such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/), [FunFam](https://download.cathdb.info/cath/releases/all-releases/), and [NMPFams](https://pavlopoulos-lab.org/envofams/databases/hmmer/)
40+
such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/), [FunFam](https://download.cathdb.info/cath/releases/all-releases/), and [NMPFams and metagRoot](https://pavlopoulos-lab.org/envofams/databases/hmmer/)
4141
2. Functional annotation:
4242
- ([`InterProScan`](https://interproscan-docs.readthedocs.io/en/v5/)) a software tool used to analyze protein sequences by scanning them against the signatures of protein families, domains, and sites in the [InterPro](https://www.ebi.ac.uk/interpro/) database, helping to identify their functional characteristics.
4343
3. Predict secondary structure compositional features such as α-helices, β-strands and coils with ([`s4pred`](https://github.com/psipred/s4pred))

conf/modules.config

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,14 @@ process {
9898
]
9999
}
100100

101+
withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:ARIA2_METAGROOT' {
102+
publishDir = [
103+
path: { "${params.outdir}/downloaded_dbs/" },
104+
mode: params.publish_dir_mode,
105+
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
106+
]
107+
}
108+
101109
withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_PFAM' {
102110
ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" }
103111
publishDir = [
@@ -128,6 +136,16 @@ process {
128136
]
129137
}
130138

139+
withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_METAGROOT' {
140+
ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" }
141+
publishDir = [
142+
path: { "${params.outdir}/domain_annotation/metagroot/" },
143+
mode: params.publish_dir_mode,
144+
pattern: "*.domtbl.gz",
145+
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
146+
]
147+
}
148+
131149
withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FUNCTIONAL_ANNOTATION:ARIA2' {
132150
publishDir = [
133151
path: { "${params.outdir}/downloaded_dbs/" },

conf/test.config

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,10 @@ params {
2525
// Input data
2626
input = params.pipelines_testdata_base_path + 'proteinannotator/samplesheet/samplesheet.csv'
2727
// Domain annotation
28-
pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz'
29-
funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz'
30-
nmpfams_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/nmpfams/nmpfamsdb_test.hmm.gz'
28+
pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz'
29+
funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz'
30+
nmpfams_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/nmpfams/nmpfamsdb_test.hmm.gz'
31+
metagroot_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/metagroot/metagroot_test.hmm.gz'
3132
// Functional annotation
3233
interproscan_db_url = params.pipelines_testdata_base_path + 'proteinannotator/testdata/interproscan/interproscan_test.tar.gz'
3334
interproscan_applications = 'Hamap,TIGRFAM,sfld'

conf/test_full.config

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@ params {
1717
// Input data for full size test
1818
input = params.pipelines_testdata_base_path + 'proteinannotator/samplesheet/samplesheet.csv'
1919
// Domain annotation
20-
pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz'
21-
funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz'
22-
nmpfams_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/nmpfams/nmpfamsdb_test.hmm.gz'
20+
pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz'
21+
funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz'
22+
nmpfams_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/nmpfams/nmpfamsdb_test.hmm.gz'
23+
metagroot_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/metagroot/metagroot_test.hmm.gz'
2324
// Functional annotation
2425
interproscan_db_url = params.pipelines_testdata_base_path + 'proteinannotator/testdata/interproscan_test.tar.gz'
2526
interproscan_applications = 'Hamap,TIGRFAM,sfld'

docs/output.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
1414
- [SeqFu](#seqfu) for input amino acid sequences quality control (QC)
1515
- [SeqKit](#seqkit) for preprocessing input amino acid sequences (i.e., gap removal, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences)
1616
- [Database download](#database-download) Optionally download selected databases for annotation.
17-
- [aria2](#aria2) - To optionally download the Pfam, FunFam, NMPFams and/or InterProScan databases through the pipeline.
17+
- [aria2](#aria2) - To optionally download the Pfam, FunFam, NMPFams, metagRoot and/or InterProScan databases through the pipeline.
1818
- [Domain annotation](#domain-annotation) Annotate proteins with domains from established repositories.
19-
- [hmmer](#hmmer) - To optionally match the input sequence to known Pfam, FunFam and/or NMPFams domains through `hmmer/hmmsearch`
19+
- [hmmer](#hmmer) - To optionally match the input sequence to known Pfam, FunFam, NMPFams and/or metagRoot domains through `hmmer/hmmsearch`
2020
- [Functional annotation](#functional-annotation) Annotate proteins with functional domains
2121
- [InterProScan](#interproscan) - Search the InterProScan database for functional domains
2222
- [s4pred](#s4pred) - Predict secondary structures of sequences, producing amino acid level probabilities of forming an α-helix, a β-strand or a coil.
@@ -73,10 +73,11 @@ The `seqkit` module is used for initial preprocessing (i.e., gap removal, conver
7373
- `interproscan_test.tar.gz`: (optional) the downloaded InterProScan archive of member databases according to the optional user-provided url
7474
- `funfam-hmm3-v4_3_0*.lib.gz`: (optional) The latest (v4_3_0) full, or a minimal test, FunFam HMM database that can be downloaded through the pipeline.
7575
- `nmpfamsdb.hmm.gz`: (optional) The latest full, or a minimal test, NMPFams HMM database that can be downloaded through the pipeline.
76+
- `metagroot.hmm.gz`: (optional) The latest full, or a minimal test, metagRoot HMM database that can be downloaded through the pipeline.
7677

7778
</details>
7879

79-
If the `skip_*` flags (e.g., `skip_pfam`, `skip_funfam`, `skip_nmpfams`, `skip_interproscan`) for each annotation database is set to `true`, or the `*_db` parameter paths (e.g., `pfam_db`, `funfam_db`, `nmpfams_db`, `interproscan_db`) are set (i.e., not `null`), or the run is resumed after a successful database download, then the respective database will not be (re)downloaded. The full database links can be found in the main `nextflow.config` file, while minimal test versions can be found in the `test` and `test_full` profiles (i.e., `conf/test.config`, `conf/test_full.config`).
80+
If the `skip_*` flag (e.g., `skip_pfam`, `skip_funfam`, `skip_nmpfams`, `skip_metagroot`, `skip_interproscan`) of an annotation database is set to `true`, or its `*_db` parameter path (e.g., `pfam_db`, `funfam_db`, `nmpfams_db`, `metagroot_db`, `interproscan_db`) is set (i.e., not `null`), or the run is resumed after a successful database download, then the respective database will not be (re)downloaded. The full database links can be found in the main `nextflow.config` file, while minimal test versions can be found in the `test` and `test_full` profiles (i.e., `conf/test.config`, `conf/test_full.config`).
8081

8182
[aria2](https://github.com/aria2/aria2/) is a lightweight multi-protocol & multi-source, cross platform download utility operated in command-line. It supports HTTP/HTTPS, FTP, SFTP, BitTorrent and Metalink.
8283

@@ -94,10 +95,12 @@ If the `skip_*` flags (e.g., `skip_pfam`, `skip_funfam`, `skip_nmpfams`, `skip_i
9495
- `<samplename>.domtbl.gz`: `hmmer/hmmsearch` results along with parameter info.
9596
- `nmpfams/`
9697
- `<samplename>.domtbl.gz`: `hmmer/hmmsearch` results along with parameter info.
98+
- `metagroot/`
99+
- `<samplename>.domtbl.gz`: `hmmer/hmmsearch` results along with parameter info.
97100

98101
</details>
99102

100-
Each of the `domain_annotation/` subfolders (e.g., `pfam`, `funfam`, `nmpfams`) contain a `.domtbl.gz` annotation file per input sample, depending on which domain annotation databases were used in the pipeline execution.
103+
Each of the `domain_annotation/` subfolders (e.g., `pfam`, `funfam`, `nmpfams`, `metagroot`) contains a `.domtbl.gz` annotation file per input sample, depending on which domain annotation databases were used in the pipeline execution.
101104

102105
[hmmer](https://github.com/EddyRivasLab/hmmer) is a tool for searching sequence databases for homologous sequences and for making sequence alignments, using probabilistic models called profile hidden Markov models (profile HMMs).
103106

docs/usage.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
## Introduction
88

99
**nf-core/proteinannotator** is a bioinformatics pipeline that computes statistics and generates sequence-level annotations for amino acid sequences.
10-
It takes a protein FASTA file as input and performs conserved domain annotation (using Pfam, FunFam and NMPFams HMM databases), functional annotation (using InterProScan), and secondary structure prediction (using s4pred).
10+
It takes a protein FASTA file as input and performs conserved domain annotation (using Pfam, FunFam, NMPFams and metagRoot HMM databases), functional annotation (using InterProScan), and secondary structure prediction (using s4pred).
1111
Optionally, paths to pre-downloaded databases can be provided to skip the automatic download steps and speed up repeated runs.
1212

1313
## Samplesheet input

main.nf

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ workflow NFCORE_PROTEINANNOTATOR {
4949
params.skip_nmpfams,
5050
params.nmpfams_db,
5151
params.nmpfams_latest_link,
52+
params.skip_metagroot,
53+
params.metagroot_db,
54+
params.metagroot_latest_link,
5255
params.skip_interproscan,
5356
params.interproscan_db_url,
5457
params.interproscan_db,

nextflow.config

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ params {
2828
skip_nmpfams = false
2929
nmpfams_db = null
3030
nmpfams_latest_link = "https://pavlopoulos-lab.org/envofams/databases/hmmer/nmpfamsdb.hmm.gz"
31+
skip_metagroot = false
32+
metagroot_db = null
33+
metagroot_latest_link = "https://pavlopoulos-lab.org/envofams/databases/hmmer/metagroot.hmm.gz"
3134
hmmsearch_evalue_cutoff = 0.001
3235

3336
// Functional annotation

nextflow_schema.json

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,23 @@
293293
"default": "https://pavlopoulos-lab.org/envofams/databases/hmmer/nmpfamsdb.hmm.gz",
294294
"description": ""
295295
},
296+
"skip_metagroot": {
297+
"type": "boolean",
298+
"fa_icon": "fas fa-ban",
299+
"description": "Skip the domain annotation with the metagRoot database.",
300+
"help": "Skips the domain annotation of input sequences against a metagRoot database."
301+
},
302+
"metagroot_db": {
303+
"type": "string",
304+
"format": "file-path",
305+
"description": "Path to an already installed metagRoot HMM database (.hmm.gz).",
306+
"help_text": "If left null and skip_metagroot is false, the pipeline will start downloading the latest metagRoot HMM library."
307+
},
308+
"metagroot_latest_link": {
309+
"type": "string",
310+
"default": "https://pavlopoulos-lab.org/envofams/databases/hmmer/metagroot.hmm.gz",
311+
"description": "metagRoot hosted link to the latest available metagRoot HMM database file."
312+
},
296313
"hmmsearch_evalue_cutoff": {
297314
"type": "number",
298315
"default": 0.001,
@@ -379,7 +396,6 @@
379396
{
380397
"$ref": "#/$defs/domain_annotation_params"
381398
},
382-
383399
{
384400
"$ref": "#/$defs/functional_annotation_options"
385401
},

0 commit comments

Comments
 (0)