diff --git a/CHANGELOG.md b/CHANGELOG.md index 202bd64..88578a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#90](https://github.com/nf-core/proteinannotator/pull/90) - Added the option to download and use the latest `metagRoot` HMM library (or use path to an existing one) for domain annotation. (by @angelphanth) - [#87](https://github.com/nf-core/proteinannotator/pull/87) - Added the option to download and use the latest `NMPFams` HMM library (or use path to an existing one) for domain annotation. (by @npechl) - [#85](https://github.com/nf-core/proteinannotator/pull/85) - Added zenodo doi in `nextflow.config`. (by @vagkaratzas) diff --git a/README.md b/README.md index 2b8f037..dac12bd 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Generate input amino acid sequence statistics with ([`SeqFu`](https://github.com ### Annotate sequences 1. Conserved domain annotation with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/)) against databases - such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/), [FunFam](https://download.cathdb.info/cath/releases/all-releases/), and [NMPFams](https://pavlopoulos-lab.org/envofams/databases/hmmer/) + such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/), [FunFam](https://download.cathdb.info/cath/releases/all-releases/), and [NMPFams and metagRoot](https://pavlopoulos-lab.org/envofams/databases/hmmer/) 2. Functional annotation: - ([`InterProScan`](https://interproscan-docs.readthedocs.io/en/v5/)) a software tool used to analyze protein sequences by scanning them against the signatures of protein families, domains, and sites in the [InterPro](https://www.ebi.ac.uk/interpro/) database, helping to identify their functional characteristics. 3. 
Predict secondary structure compositional features such as α-helices, β-strands and coils with ([`s4pred`](https://github.com/psipred/s4pred)) diff --git a/conf/modules.config b/conf/modules.config index b5a5635..b325242 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -98,6 +98,14 @@ process { ] } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:ARIA2_METAGROOT' { + publishDir = [ + path: { "${params.outdir}/downloaded_dbs/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_PFAM' { ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" } publishDir = [ @@ -128,6 +136,16 @@ process { ] } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_METAGROOT' { + ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" } + publishDir = [ + path: { "${params.outdir}/domain_annotation/metagroot/" }, + mode: params.publish_dir_mode, + pattern: "*.domtbl.gz", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FUNCTIONAL_ANNOTATION:ARIA2' { publishDir = [ path: { "${params.outdir}/downloaded_dbs/" }, diff --git a/conf/test.config b/conf/test.config index 9defa1c..23f4a82 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,9 +25,10 @@ params { // Input data input = params.pipelines_testdata_base_path + 'proteinannotator/samplesheet/samplesheet.csv' // Domain annotation - pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz' - funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' - nmpfams_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/nmpfams/nmpfamsdb_test.hmm.gz' + pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz' + funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' + nmpfams_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/nmpfams/nmpfamsdb_test.hmm.gz' + metagroot_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/metagroot/metagroot_test.hmm.gz' // Functional annotation interproscan_db_url = params.pipelines_testdata_base_path + 'proteinannotator/testdata/interproscan/interproscan_test.tar.gz' interproscan_applications = 'Hamap,TIGRFAM,sfld' diff --git a/conf/test_full.config b/conf/test_full.config index bcf1d96..966ee53 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -17,9 +17,10 @@ params { // Input data for full size test input = params.pipelines_testdata_base_path + 'proteinannotator/samplesheet/samplesheet.csv' // Domain annotation - pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz' - funfam_latest_link = params.pipelines_testdata_base_path + 
'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' - nmpfams_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/nmpfams/nmpfamsdb_test.hmm.gz' + pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz' + funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' + nmpfams_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/nmpfams/nmpfamsdb_test.hmm.gz' + metagroot_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/metagroot/metagroot_test.hmm.gz' // Functional annotation interproscan_db_url = params.pipelines_testdata_base_path + 'proteinannotator/testdata/interproscan_test.tar.gz' interproscan_applications = 'Hamap,TIGRFAM,sfld' diff --git a/docs/output.md b/docs/output.md index 0e6387f..85e3439 100644 --- a/docs/output.md +++ b/docs/output.md @@ -14,9 +14,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [SeqFu](#seqfu) for input amino acid sequences quality control (QC) - [SeqKit](#seqkit) for preprocessing input amino acid sequences (i.e., gap removal, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences) - [Database download](#database-download) Optionally download selected databases for annotation. - - [aria2](#aria2) - To optionally download the Pfam, FunFam, NMPFams and/or InterProScan databases through the pipeline. + - [aria2](#aria2) - To optionally download the Pfam, FunFam, NMPFams, metagRoot and/or InterProScan databases through the pipeline. - [Domain annotation](#domain-annotation) Annotate proteins with domains from established repositories. 
- - [hmmer](#hmmer) - To optionally match the input sequence to known Pfam, FunFam and/or NMPFams domains through `hmmer/hmmsearch` + - [hmmer](#hmmer) - To optionally match the input sequence to known Pfam, FunFam, NMPFams and/or metagRoot domains through `hmmer/hmmsearch` - [Functional annotation](#functional-annotation) Annotate proteins with functional domains - [InterProScan](#Interproscan) - Search the InterProScan database for functional domains - [s4pred](#s4pred) - Predict secondary structures of sequences, producing amino acid level probabilities of forming an α-helix, a β-strand or a coil. @@ -73,10 +73,11 @@ The `seqkit` module is used for initial preprocessing (i.e., gap removal, conver - `interproscan_test.tar.gz`: (optional) the downloaded InterProScan archive of member databases according to the optional user-provided url - `funfam-hmm3-v4_3_0*.lib.gz`: (optional) The latest (v4_3_0) full, or a minimal test, FunFam HMM database that can be downloaded through the pipeline. - `nmpfamsdb.hmm.gz`: (optional) The latest full, or a minimal test, NMPFams HMM database that can be downloaded through the pipeline. + - `metagroot.hmm.gz`: (optional) The latest full, or a minimal test, metagRoot HMM database that can be downloaded through the pipeline. -If the `skip_*` flags (e.g., `skip_pfam`, `skip_funfam`, `skip_nmpfams`, `skip_interproscan`) for each annotation database is set to `true`, or the `*_db` parameter paths (e.g., `pfam_db`, `funfam_db`, `nmpfams_db`, `interproscan_db`) are set (i.e., not `null`), or the run is resumed after a successful database download, then the respective database will not be (re)downloaded. The full database links can be found in the main `nextflow.config` file, while minimal test versions can be found in the `test` and `test_full` profiles (i.e., `conf/test.config`, `conf/test_full.config`). 
+If the `skip_*` flag (e.g., `skip_pfam`, `skip_funfam`, `skip_nmpfams`, `skip_metagroot`, `skip_interproscan`) of an annotation database is set to `true`, or its `*_db` parameter path (e.g., `pfam_db`, `funfam_db`, `nmpfams_db`, `metagroot_db`, `interproscan_db`) is set (i.e., not `null`), or the run is resumed after a successful database download, then the respective database will not be (re)downloaded. The full database links can be found in the main `nextflow.config` file, while minimal test versions can be found in the `test` and `test_full` profiles (i.e., `conf/test.config`, `conf/test_full.config`). [aria2](https://github.com/aria2/aria2/) is a lightweight multi-protocol & multi-source, cross platform download utility operated in command-line. It supports HTTP/HTTPS, FTP, SFTP, BitTorrent and Metalink. @@ -94,10 +95,12 @@ If the `skip_*` flags (e.g., `skip_pfam`, `skip_funfam`, `skip_nmpfams`, `skip_i - `.domtbl.gz`: `hmmer/hmmsearch` results along parameters info. - `nmpfams/` - `.domtbl.gz`: `hmmer/hmmsearch` results along parameters info. + - `metagroot/` + - `.domtbl.gz`: `hmmer/hmmsearch` results along parameters info. -Each of the `domain_annotation/` subfolders (e.g., `pfam`, `funfam`, `nmpfams`) contain a `.domtbl.gz` annotation file per input sample, depending on which domain annotation databases were used in the pipeline execution. +Each of the `domain_annotation/` subfolders (e.g., `pfam`, `funfam`, `nmpfams`, `metagroot`) contains a `.domtbl.gz` annotation file per input sample, depending on which domain annotation databases were used in the pipeline execution. [hmmer](https://github.com/EddyRivasLab/hmmer) is a fast and flexible alignment trimming tool that keeps phylogenetically informative sites and removes others. 
diff --git a/docs/usage.md b/docs/usage.md index 72d53cc..8978d1e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -7,7 +7,7 @@ ## Introduction **nf-core/proteinannotator** is a bioinformatics pipeline that computes statistics and generates sequence-level annotations for amino acid sequences. -It takes a protein FASTA file as input and performs conserved domain annotation (using Pfam, FunFam and NMPFams HMM databases), functional annotation (using InterProScan), and secondary structure prediction (using s4pred). +It takes a protein FASTA file as input and performs conserved domain annotation (using Pfam, FunFam, NMPFams and metagRoot HMM databases), functional annotation (using InterProScan), and secondary structure prediction (using s4pred). Optionally, paths to pre-downloaded databases can be provided to skip the automatic download steps and speed up repeated runs. ## Samplesheet input diff --git a/main.nf b/main.nf index f9286d5..e29a062 100644 --- a/main.nf +++ b/main.nf @@ -49,6 +49,9 @@ workflow NFCORE_PROTEINANNOTATOR { params.skip_nmpfams, params.nmpfams_db, params.nmpfams_latest_link, + params.skip_metagroot, + params.metagroot_db, + params.metagroot_latest_link, params.skip_interproscan, params.interproscan_db_url, params.interproscan_db, diff --git a/nextflow.config b/nextflow.config index 3b9086a..b96d124 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,6 +28,9 @@ params { skip_nmpfams = false nmpfams_db = null nmpfams_latest_link = "https://pavlopoulos-lab.org/envofams/databases/hmmer/nmpfamsdb.hmm.gz" + skip_metagroot = false + metagroot_db = null + metagroot_latest_link = "https://pavlopoulos-lab.org/envofams/databases/hmmer/metagroot.hmm.gz" hmmsearch_evalue_cutoff = 0.001 // Functional annotation diff --git a/nextflow_schema.json b/nextflow_schema.json index 46a08a6..1d79c7a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -293,6 +293,23 @@ "default": "https://pavlopoulos-lab.org/envofams/databases/hmmer/nmpfamsdb.hmm.gz", 
"description": "" }, + "skip_metagroot": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Skip the domain annotation with the metagRoot database.", + "help_text": "Skips the domain annotation of input sequences against a metagRoot database." + }, + "metagroot_db": { + "type": "string", + "format": "file-path", + "description": "Path to an already installed metagRoot HMM database (.hmm.gz).", + "help_text": "If left null and skip_metagroot is false, the pipeline will start downloading the latest metagRoot HMM library." + }, + "metagroot_latest_link": { + "type": "string", + "default": "https://pavlopoulos-lab.org/envofams/databases/hmmer/metagroot.hmm.gz", + "description": "metagRoot hosted link to the latest available metagRoot HMM database file." + }, "hmmsearch_evalue_cutoff": { "type": "number", "default": 0.001, @@ -379,7 +396,6 @@ { "$ref": "#/$defs/domain_annotation_params" }, - { "$ref": "#/$defs/functional_annotation_options" }, diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 5028582..e3f06a1 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "InProgress", "datePublished": "2026-02-09T13:54:13+00:00", - "description": "

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/proteinannotator)\n[![GitHub Actions CI Status](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinannotator/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.18547735-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.18547735)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinannotator)\n\n[![Get 
help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinannotator-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinannotator)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/proteinannotator** is a bioinformatics pipeline that computes statistics for protein FASTA inputs and produces protein annotations based on predicted sequence features, including conserved domains, functions, and secondary structure.\n\n

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n### Check quality and pre-process\n\nGenerate input amino acid sequence statistics with ([`SeqFu`](https://github.com/telatin/seqfu2/)) and pre-process them (i.e., gap removal, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences) with ([`SeqKit`](https://github.com/shenwei356/seqkit/))\n\n### Annotate sequences\n\n1. Conserved domain annotation with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/)) against databases\n such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/), [FunFam](https://download.cathdb.info/cath/releases/all-releases/), and [NMPFams](https://pavlopoulos-lab.org/envofams/databases/hmmer/)\n2. Functional annotation:\n - ([`InterProScan`](https://interproscan-docs.readthedocs.io/en/v5/)) a software tool used to analyze protein sequences by scanning them against the signatures of protein families, domains, and sites in the [InterPro](https://www.ebi.ac.uk/interpro/) database, helping to identify their functional characteristics.\n3. Predict secondary structure compositional features such as \u03b1-helices, \u03b2-strands and coils with ([`s4pred`](https://github.com/psipred/s4pred))\n4. Present QC stats for input sequences before and after initial pre-processing with ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nid,fasta\nspecies1,species1_proteins.fasta\nspecies2,species2_proteins.fasta\n```\n\nEach row represents a FASTA file of proteins from a single species.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/proteinannotator \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinannotator/usage) and the [parameter documentation](https://nf-co.re/proteinannotator/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/proteinannotator/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/proteinannotator/output).\n\n## Credits\n\nnf-core/proteinannotator was originally written by Olga Botvinnik and Evangelos Karatzas.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Michael L Heuer](https://github.com/heuermh)\n- [Edmund Miller](https://github.com/edmundmiller)\n- [Eric Wei](https://github.com/eweizy)\n- [Martin Beracochea](https://github.com/mberacochea)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing 
guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#proteinannotator` channel](https://nfcore.slack.com/channels/proteinannotator) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/proteinannotator for your analysis, please cite it using the following doi: [10.5281/zenodo.18547735](https://doi.org/10.5281/zenodo.18547735)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/proteinannotator)\n[![GitHub Actions CI Status](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinannotator/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.18547735-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.18547735)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinannotator)\n\n[![Get 
help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinannotator-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinannotator)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/proteinannotator** is a bioinformatics pipeline that computes statistics for protein FASTA inputs and produces protein annotations based on predicted sequence features, including conserved domains, functions, and secondary structure.\n\n

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n### Check quality and pre-process\n\nGenerate input amino acid sequence statistics with ([`SeqFu`](https://github.com/telatin/seqfu2/)) and pre-process them (i.e., gap removal, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences) with ([`SeqKit`](https://github.com/shenwei356/seqkit/))\n\n### Annotate sequences\n\n1. Conserved domain annotation with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/)) against databases\n such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/), [FunFam](https://download.cathdb.info/cath/releases/all-releases/), and [NMPFams and metagRoot](https://pavlopoulos-lab.org/envofams/databases/hmmer/)\n2. Functional annotation:\n - ([`InterProScan`](https://interproscan-docs.readthedocs.io/en/v5/)) a software tool used to analyze protein sequences by scanning them against the signatures of protein families, domains, and sites in the [InterPro](https://www.ebi.ac.uk/interpro/) database, helping to identify their functional characteristics.\n3. Predict secondary structure compositional features such as \u03b1-helices, \u03b2-strands and coils with ([`s4pred`](https://github.com/psipred/s4pred))\n4. Present QC stats for input sequences before and after initial pre-processing with ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nid,fasta\nspecies1,species1_proteins.fasta\nspecies2,species2_proteins.fasta\n```\n\nEach row represents a FASTA file of proteins from a single species.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/proteinannotator \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinannotator/usage) and the [parameter documentation](https://nf-co.re/proteinannotator/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/proteinannotator/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/proteinannotator/output).\n\n## Credits\n\nnf-core/proteinannotator was originally written by Olga Botvinnik and Evangelos Karatzas.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Michael L Heuer](https://github.com/heuermh)\n- [Edmund Miller](https://github.com/edmundmiller)\n- [Eric Wei](https://github.com/eweizy)\n- [Martin Beracochea](https://github.com/mberacochea)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing 
guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#proteinannotator` channel](https://nfcore.slack.com/channels/proteinannotator) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/proteinannotator for your analysis, please cite it using the following doi: [10.5281/zenodo.18547735](https://doi.org/10.5281/zenodo.18547735)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" diff --git a/subworkflows/local/domain_annotation/main.nf b/subworkflows/local/domain_annotation/main.nf index 467dbda..456d620 100644 --- a/subworkflows/local/domain_annotation/main.nf +++ b/subworkflows/local/domain_annotation/main.nf @@ -1,9 +1,11 @@ -include { ARIA2 as ARIA2_PFAM } from '../../../modules/nf-core/aria2/main' -include { ARIA2 as ARIA2_FUNFAM } from '../../../modules/nf-core/aria2/main' -include { ARIA2 as ARIA2_NMPFAMS } from '../../../modules/nf-core/aria2/main' -include { HMMER_HMMSEARCH as HMMSEARCH_PFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' -include { HMMER_HMMSEARCH as HMMSEARCH_FUNFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' -include { HMMER_HMMSEARCH as HMMSEARCH_NMPFAMS } from '../../../modules/nf-core/hmmer/hmmsearch/main' +include { ARIA2 as ARIA2_PFAM } from '../../../modules/nf-core/aria2/main' +include { ARIA2 as ARIA2_FUNFAM } from '../../../modules/nf-core/aria2/main' +include { ARIA2 as ARIA2_NMPFAMS } 
from '../../../modules/nf-core/aria2/main' +include { ARIA2 as ARIA2_METAGROOT } from '../../../modules/nf-core/aria2/main' +include { HMMER_HMMSEARCH as HMMSEARCH_PFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' +include { HMMER_HMMSEARCH as HMMSEARCH_FUNFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' +include { HMMER_HMMSEARCH as HMMSEARCH_NMPFAMS } from '../../../modules/nf-core/hmmer/hmmsearch/main' +include { HMMER_HMMSEARCH as HMMSEARCH_METAGROOT } from '../../../modules/nf-core/hmmer/hmmsearch/main' workflow DOMAIN_ANNOTATION { take: @@ -17,13 +19,17 @@ workflow DOMAIN_ANNOTATION { skip_nmpfams // boolean nmpfams_db // string nmpfams_latest_link // string + skip_metagroot // boolean + metagroot_db // string, path to the metagroot HMM database, if already exists + metagroot_latest_link // string, path to the latest metagroot HMM database, to download main: - ch_versions = channel.empty() - ch_pfam_domains = channel.empty() - ch_funfam_domains = channel.empty() - ch_nmpfams_domains = channel.empty() + ch_versions = channel.empty() + ch_pfam_domains = channel.empty() + ch_funfam_domains = channel.empty() + ch_nmpfams_domains = channel.empty() + ch_metagroot_domains = channel.empty() if (!skip_pfam) { if (!pfam_db) { @@ -85,9 +91,30 @@ workflow DOMAIN_ANNOTATION { ch_nmpfams_domains = HMMSEARCH_NMPFAMS.out.domain_summary } + if (!skip_metagroot) { + if (!metagroot_db) { + ch_metagroot_link = channel.of([ [ id: 'metagroot' ], metagroot_latest_link ]) + + ARIA2_METAGROOT( ch_metagroot_link ) + ch_versions = ch_versions.mix( ARIA2_METAGROOT.out.versions ) + ch_metagroot_db = ARIA2_METAGROOT.out.downloaded_file + } else { + ch_metagroot_db = channel.of([ [ id: 'metagroot' ], metagroot_db ]) + } + + ch_input_for_hmmsearch_metagroot = ch_fasta + .combine(ch_metagroot_db) + .map{ meta, seqs, _meta2, models -> [meta, models, seqs, false, false, true] } + + HMMSEARCH_METAGROOT( ch_input_for_hmmsearch_metagroot ) + ch_versions = ch_versions.mix( 
HMMSEARCH_METAGROOT.out.versions.first() ) + ch_metagroot_domains = HMMSEARCH_METAGROOT.out.domain_summary + } + emit: - pfam_domains = ch_pfam_domains - funfam_domains = ch_funfam_domains - nmpfams_domains = ch_nmpfams_domains - versions = ch_versions + pfam_domains = ch_pfam_domains + funfam_domains = ch_funfam_domains + nmpfams_domains = ch_nmpfams_domains + metagroot_domains = ch_metagroot_domains + versions = ch_versions } diff --git a/subworkflows/local/domain_annotation/meta.yml b/subworkflows/local/domain_annotation/meta.yml index 80f38ba..630237b 100644 --- a/subworkflows/local/domain_annotation/meta.yml +++ b/subworkflows/local/domain_annotation/meta.yml @@ -54,6 +54,18 @@ input: type: string description: | Path to the latest nmpfamsDB HMM database, to download + - skip_metagroot: + type: boolean + description: | + Skip domain annotation with metagRoot + - metagroot_db: + type: string + description: | + Path to an existing HMM metagRoot library on the system. If provided, the ARIA2_METAGROOT db download will be skipped. 
+ - metagroot_latest_link: + type: string + description: | + Path to the latest metagRoot HMM database, to download output: - pfam_domains: type: file @@ -67,6 +79,10 @@ output: type: file description: | domtbl.gz files with nmpfams domain annotation for input amino acid sequences + - metagroot_domains: + type: file + description: | + domtbl.gz files with metagroot domain annotation for input amino acid sequences - versions: type: file description: | diff --git a/subworkflows/local/domain_annotation/tests/main.nf.test b/subworkflows/local/domain_annotation/tests/main.nf.test index 18030f4..0dc2598 100644 --- a/subworkflows/local/domain_annotation/tests/main.nf.test +++ b/subworkflows/local/domain_annotation/tests/main.nf.test @@ -22,6 +22,9 @@ nextflow_workflow { input[7] = true // skip_nmpfams input[8] = null // nmpfams_db input[9] = params.pipelines_testdata_base_path + '/testdata/nmpfams/nmpfamsdb_test.hmm.gz' // nmpfams_latest_link + input[10] = true // skip_metagroot + input[11] = null // metagroot_db + input[12] = params.pipelines_testdata_base_path + '/testdata/metagroot/metagroot_test.hmm.gz' // metagroot_latest_link """ } } @@ -56,6 +59,9 @@ nextflow_workflow { input[7] = true // skip_nmpfams input[8] = null // nmpfams_db input[9] = params.pipelines_testdata_base_path + '/testdata/nmpfams/nmpfamsdb_test.hmm.gz' // nmpfams_latest_link + input[10] = true // skip_metagroot + input[11] = null // metagroot_db + input[12] = params.pipelines_testdata_base_path + '/testdata/metagroot/metagroot_test.hmm.gz' // metagroot_latest_link """ } } @@ -89,6 +95,9 @@ nextflow_workflow { input[7] = false // skip_nmpfams input[8] = null // nmpfams_db input[9] = params.pipelines_testdata_base_path + '/testdata/nmpfams/nmpfamsdb_test.hmm.gz' // nmpfams_latest_link + input[10] = true // skip_metagroot + input[11] = null // metagroot_db + input[12] = params.pipelines_testdata_base_path + '/testdata/metagroot/metagroot_test.hmm.gz' // metagroot_latest_link """ } } @@ -104,6 +113,42 
@@ nextflow_workflow { } } + test("faa - metagroot") { + + when { + workflow { + """ + input[0] = channel.of([ + [ id: 'test' ], + file(params.pipelines_testdata_base_path + '/testdata/sequences/test_proteins.faa', checkIfExists: true) + ]) + input[1] = true // skip_pfam + input[2] = null // pfam_db + input[3] = params.pipelines_testdata_base_path + '/testdata/pfam/Pfam-A_test.hmm.gz' // pfam_latest_link + input[4] = true // skip_funfam + input[5] = null // funfam_db + input[6] = params.pipelines_testdata_base_path + '/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' // funfam_latest_link + input[7] = true // skip_nmpfams + input[8] = null // nmpfams_db + input[9] = params.pipelines_testdata_base_path + '/testdata/nmpfams/nmpfamsdb_test.hmm.gz' // nmpfams_latest_link + input[10] = false // skip_metagroot + input[11] = null // metagroot_db + input[12] = params.pipelines_testdata_base_path + '/testdata/metagroot/metagroot_test.hmm.gz' // metagroot_latest_link + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + path(workflow.out.metagroot_domains[0][1]).linesGzip[0..7], + workflow.out.versions.collect { path(it).yaml }.unique() + ).match()} + ) + } + } + test("faa - domain annotation - stub") { options "-stub" @@ -123,7 +168,10 @@ nextflow_workflow { input[6] = params.pipelines_testdata_base_path + '/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' // funfam_latest_link input[7] = false // skip_nmpfams input[8] = null // nmpfams_db - input[9] = params.pipelines_testdata_base_path + '/testdata/nmpfams/nmpfamsdb_test.hmm.gz' // nmpfams_latest_link + input[9] = params.pipelines_testdata_base_path + '/testdata/nmpfams/nmpfamsdb_test.hmm.gz' // nmpfams_latest_link + input[10] = false // skip_metagroot + input[11] = null // metagroot_db + input[12] = params.pipelines_testdata_base_path + '/testdata/metagroot/metagroot_test.hmm.gz' // metagroot_latest_link """ } } diff --git a/subworkflows/local/domain_annotation/tests/main.nf.test.snap 
b/subworkflows/local/domain_annotation/tests/main.nf.test.snap index 80ce69a..fd4f74e 100644 --- a/subworkflows/local/domain_annotation/tests/main.nf.test.snap +++ b/subworkflows/local/domain_annotation/tests/main.nf.test.snap @@ -1,4 +1,35 @@ { + "faa - metagroot": { + "content": [ + [ + "# --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord", + "# target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target", + "#------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------", + "T1024 - 408 F101326 - 425 9.3e-13 34.9 26.2 1 1 1.8e-12 3.6e-12 33.0 26.2 13 351 18 340 12 407 0.74 LmrP, , 408 residues|", + "T1024 - 408 F226054 - 421 1.3e-13 37.4 26.4 1 1 8.6e-14 1.7e-13 37.0 26.4 2 404 2 404 1 408 0.73 LmrP, , 408 residues|", + "T1024 - 408 F240027 - 384 8.4e-10 25.0 5.2 1 1 8e-10 1.6e-09 24.1 5.2 26 163 26 160 6 178 0.88 LmrP, , 408 residues|", + "T1024 - 408 F287588 - 413 2e-10 26.9 23.3 1 1 1.6e-10 3.1e-10 26.3 23.3 48 363 42 370 30 406 0.74 LmrP, , 408 residues|", + "T1024 - 408 F294204 - 387 3.8e-06 12.8 25.9 1 1 2.8e-06 5.6e-06 12.3 25.9 16 372 41 406 30 408 0.76 LmrP, , 408 residues|" + ], + [ + { + "DOMAIN_ANNOTATION:HMMSEARCH_METAGROOT": { + "hmmer": 3.4 + } + }, + { + "DOMAIN_ANNOTATION:ARIA2_METAGROOT": { + "aria2": "1.36.0" + } + } + ] + ], + "timestamp": "2026-03-30T17:28:28.71093", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, "faa - domain annotation": { "content": [ [ @@ -44,7 +75,7 @@ } ] ], - "timestamp": "2026-03-13T14:51:37.636657", + "timestamp": "2026-03-30T17:28:01.729059", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" @@ -135,9 +166,19 @@ ] ], "3": [ + [ + { + "id": "test" + }, + "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + 
"4": [ + "versions.yml:md5,046e5161e3e1dff2ab111ddf4bb27331", "versions.yml:md5,160d4c5a5001cfb4ff57b94fc52b67d9", "versions.yml:md5,1b7d208e42364fb87160693faa4e83b9", "versions.yml:md5,35e41735706132967dd94bb636833a4a", + "versions.yml:md5,55939a7ab71dab922d448cf99472feeb", "versions.yml:md5,9045f482d64e7666e62932b0578b665e", "versions.yml:md5,a74a0c8fcb741e59bc14424f612b8d09", "versions.yml:md5,f1d8a406d3dcb97a7c15e9c810926de1" @@ -150,6 +191,14 @@ "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], + "metagroot_domains": [ + [ + { + "id": "test" + }, + "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], "nmpfams_domains": [ [ { @@ -167,16 +216,18 @@ ] ], "versions": [ + "versions.yml:md5,046e5161e3e1dff2ab111ddf4bb27331", "versions.yml:md5,160d4c5a5001cfb4ff57b94fc52b67d9", "versions.yml:md5,1b7d208e42364fb87160693faa4e83b9", "versions.yml:md5,35e41735706132967dd94bb636833a4a", + "versions.yml:md5,55939a7ab71dab922d448cf99472feeb", "versions.yml:md5,9045f482d64e7666e62932b0578b665e", "versions.yml:md5,a74a0c8fcb741e59bc14424f612b8d09", "versions.yml:md5,f1d8a406d3dcb97a7c15e9c810926de1" ] } ], - "timestamp": "2026-03-13T09:45:07.520815", + "timestamp": "2026-03-30T17:28:37.677345", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf index f5b753a..ded790e 100644 --- a/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf @@ -180,7 +180,7 @@ def toolCitationText() { params.skip_preprocessing ? "" : "Input sequences were preprocessed with SeqKit (gap trimming, length filtering, validation, duplicate removal) (Shen et al. 2024)." ].join(' ').trim() - def domain_annotation_text = (params.skip_pfam && params.skip_funfam && params.skip_nmpfams) ? "" : "Domains were annotated with hmmer/hmmsearch (Eddy et al. 2011)." 
+ def domain_annotation_text = (params.skip_pfam && params.skip_funfam && params.skip_nmpfams && params.skip_metagroot) ? "" : "Domains were annotated with hmmer/hmmsearch (Eddy et al. 2011)." def prediction_text = params.skip_s4pred ? "" : "Secondary structures were predicted via the s4pred software (Moffat et al. 2021)." @@ -202,7 +202,7 @@ def toolBibliographyText() { params.skip_preprocessing ? '' : '
  • Shen, W., Sipos, B., & Zhao, L. (2024). SeqKit2: A Swiss army knife for sequence and alignment processing. Imeta, 3(3), e191. doi: 10.1002/imt2.191
  • ' ].join(' ').trim() - def domain_annotation_text = (params.skip_pfam && params.skip_funfam && params.skip_nmpfams) ? '' : '
  • Eddy, S. R. (2011). Accelerated profile HMM searches. PLoS computational biology, 7(10), e1002195. doi: 10.1371/journal.pcbi.1002195
  • ' + def domain_annotation_text = (params.skip_pfam && params.skip_funfam && params.skip_nmpfams && params.skip_metagroot) ? '' : '
  • Eddy, S. R. (2011). Accelerated profile HMM searches. PLoS computational biology, 7(10), e1002195. doi: 10.1371/journal.pcbi.1002195
  • ' def prediction_text = params.skip_s4pred ? '' : '
  • Moffat, L., & Jones, D. T. (2021). Increasing the accuracy of single sequence prediction methods using a deep semi-supervised learning framework. Bioinformatics, 37(21), 3744-3751. doi: 10.1093/bioinformatics/btab491
  • ' diff --git a/tests/.nftignore b/tests/.nftignore index 0b6bd76..6441f81 100644 --- a/tests/.nftignore +++ b/tests/.nftignore @@ -17,6 +17,9 @@ domain_annotation/funfam/l_arginase.domtbl.gz domain_annotation/nmpfams/T1024.domtbl.gz domain_annotation/nmpfams/T1026.domtbl.gz domain_annotation/nmpfams/l_arginase.domtbl.gz +domain_annotation/metagroot/T1024.domtbl.gz +domain_annotation/metagroot/T1026.domtbl.gz +domain_annotation/metagroot/l_arginase.domtbl.gz functional_annotation/interproscan/T1024/T1024.gff3 functional_annotation/interproscan/T1024/T1024.tsv functional_annotation/interproscan/T1026/T1026.gff3 diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap index 10d5d5a..48525fe 100644 --- a/tests/default.nf.test.snap +++ b/tests/default.nf.test.snap @@ -1,7 +1,7 @@ { "-profile test": { "content": [ - 36, + 40, { "ARIA2": { "aria2": "1.36.0" @@ -9,6 +9,9 @@ "ARIA2_FUNFAM": { "aria2": "1.36.0" }, + "ARIA2_METAGROOT": { + "aria2": "1.36.0" + }, "ARIA2_NMPFAMS": { "aria2": "1.36.0" }, @@ -18,6 +21,9 @@ "HMMSEARCH_FUNFAM": { "hmmer": 3.4 }, + "HMMSEARCH_METAGROOT": { + "hmmer": 3.4 + }, "HMMSEARCH_NMPFAMS": { "hmmer": 3.4 }, @@ -58,6 +64,10 @@ "domain_annotation/funfam/T1024.domtbl.gz", "domain_annotation/funfam/T1026.domtbl.gz", "domain_annotation/funfam/l_arginase.domtbl.gz", + "domain_annotation/metagroot", + "domain_annotation/metagroot/T1024.domtbl.gz", + "domain_annotation/metagroot/T1026.domtbl.gz", + "domain_annotation/metagroot/l_arginase.domtbl.gz", "domain_annotation/nmpfams", "domain_annotation/nmpfams/T1024.domtbl.gz", "domain_annotation/nmpfams/T1026.domtbl.gz", @@ -88,6 +98,7 @@ "downloaded_dbs/interproscan_db/tigrfam/15.0/TIGRFAMs_15.0_HMM.LIB", "downloaded_dbs/interproscan_db/tigrfam/15.0/TIGRFAMs_HMM.LIB", "downloaded_dbs/interproscan_test.tar.gz", + "downloaded_dbs/metagroot_test.hmm.gz", "downloaded_dbs/nmpfamsdb_test.hmm.gz", "functional_annotation", "functional_annotation/interproscan", @@ -193,6 +204,7 @@ 
"TIGRFAMs_15.0_HMM.LIB:md5,64f2b2c9e834b47b17d91bb9a6a0067e", "TIGRFAMs_HMM.LIB:md5,543da3f4b65eed9ec393986c6c6ff0ba", "interproscan_test.tar.gz:md5,cde88c0cd841c84dc1203e64854c762b", + "metagroot_test.hmm.gz:md5,d23de95bf39fb6e27ffb266ce61ac98e", "nmpfamsdb_test.hmm.gz:md5,ad7a094618ccfdaeed1c03e93f6abf1e", "T1024.json:md5,0288f7551a14faedc409dd374b3e073e", "T1024.xml:md5,63a3db0eb0e1f76403411602c23b721e", @@ -232,8 +244,8 @@ ], "meta": { "nf-test": "0.9.3", - "nextflow": "25.10.2" + "nextflow": "25.10.4" }, - "timestamp": "2026-03-14T10:06:42.466898492" + "timestamp": "2026-03-31T11:50:46.606922418" } } \ No newline at end of file diff --git a/workflows/proteinannotator.nf b/workflows/proteinannotator.nf index b0032f2..a0d99fa 100644 --- a/workflows/proteinannotator.nf +++ b/workflows/proteinannotator.nf @@ -21,21 +21,24 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_prot workflow PROTEINANNOTATOR { take: - ch_samplesheet // channel: samplesheet read in from --input - skip_preprocessing // boolean - skip_pfam // boolean - pfam_db // string, path to the pfam HMM database, if already exists - pfam_latest_link // string, path to the latest pfam HMM database, to download - skip_funfam // boolean - funfam_db // string, path to the pfam HMM database, if already exists - funfam_latest_link // string, path to the latest pfam HMM database, to download - skip_nmpfams // boolean - nmpfams_db // string - nmpfams_latest_link // string - skip_interproscan // boolean - interproscan_db_url // string, url to download db - interproscan_db // string, existing db - skip_s4pred // boolean + ch_samplesheet // channel: samplesheet read in from --input + skip_preprocessing // boolean + skip_pfam // boolean + pfam_db // string, path to the pfam HMM database, if already exists + pfam_latest_link // string, path to the latest pfam HMM database, to download + skip_funfam // boolean + funfam_db // string, path to the pfam HMM database, if already exists + 
funfam_latest_link // string, path to the latest funfam HMM database, to download + skip_nmpfams // boolean + nmpfams_db // string + nmpfams_latest_link // string + skip_metagroot // boolean + metagroot_db // string, path to the metagroot HMM database, if already exists + metagroot_latest_link // string, path to the latest metagroot HMM database, to download + skip_interproscan // boolean + interproscan_db_url // string, url to download db + interproscan_db // string, existing db + skip_s4pred // boolean main: @@ -55,7 +58,10 @@ workflow PROTEINANNOTATOR { funfam_latest_link, skip_nmpfams, nmpfams_db, - nmpfams_latest_link + nmpfams_latest_link, + skip_metagroot, + metagroot_db, + metagroot_latest_link ) ch_versions = ch_versions.mix( DOMAIN_ANNOTATION.out.versions )