diff --git a/conf/modules.config b/conf/modules.config index ec1428c..9f40a6d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -90,6 +90,17 @@ process { ] } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:WGET_MROOT' { + ext.prefix = "HMM" + ext.suffix = "tar.gz" + ext.args = '--no-check-certificate' // prefix/suffix explicitly name the output (HMM.tar.gz); --no-check-certificate makes wget skip TLS certificate verification + publishDir = [ + path: { "${params.outdir}/downloaded_dbs/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_PFAM' { ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" } publishDir = [ @@ -110,6 +121,17 @@ process { ] } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_MROOT' { + ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" } + publishDir = [ + path: { "${params.outdir}/domain_annotation/mroot/" }, + mode: params.publish_dir_mode, + pattern: "*.domtbl.gz", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FUNCTIONAL_ANNOTATION:ARIA2' { publishDir = [ path: { "${params.outdir}/downloaded_dbs/" }, diff --git a/conf/test.config b/conf/test.config index 252ec87..02c92be 100644 --- a/conf/test.config +++ b/conf/test.config @@ -27,6 +27,7 @@ params { // Domain annotation pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz' funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' + mroot_latest_link = 'https://pavlopoulos-lab.org/metagroot/DownloadHmm' // Functional annotation interproscan_db_url = params.pipelines_testdata_base_path + 'proteinannotator/testdata/interproscan/interproscan_test.tar.gz' interproscan_applications = 'Hamap,TIGRFAM,sfld' diff --git a/main.nf b/main.nf index 98d7d67..d7f1972 100644 --- a/main.nf +++ b/main.nf @@ -46,6 +46,9 @@ workflow NFCORE_PROTEINANNOTATOR { params.skip_funfam, params.funfam_db, params.funfam_latest_link, + params.skip_mroot, + params.mroot_db, + params.mroot_latest_link, params.skip_interproscan, params.interproscan_db_url, params.interproscan_db, diff --git a/modules.json b/modules.json index 37ba5b8..6a31b58 100644 --- a/modules.json +++ b/modules.json @@ -59,6 +59,11 @@ "branch": "master", "git_sha": "447f7bc0fa41dfc2400c8cad4c0291880dc060cf", "installed_by": ["modules"] + }, + "wget": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/wget/environment.yml b/modules/nf-core/wget/environment.yml new file mode 100644 index 0000000..9eb304e --- /dev/null +++ b/modules/nf-core/wget/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::wget=1.21.4 diff --git 
a/modules/nf-core/wget/main.nf b/modules/nf-core/wget/main.nf new file mode 100644 index 0000000..9bc6f15 --- /dev/null +++ b/modules/nf-core/wget/main.nf @@ -0,0 +1,48 @@ +process WGET { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/3b/3b54fa9135194c72a18d00db6b399c03248103f87e43ca75e4b50d61179994b3/data': + 'community.wave.seqera.io/library/wget:1.21.4--8b0fcde81c17be5e' }" + + input: + tuple val(meta), val(url) + + output: + tuple val(meta), path("${prefix}.${suffix}"), emit: outfile + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: 'html' + """ + wget \\ + -O - \\ + $args \\ + $url \\ + > ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(wget --version | head -1 | cut -d ' ' -f 3) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: 'html' + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(wget --version | head -1 | cut -d ' ' -f 3) + END_VERSIONS + """ +} diff --git a/modules/nf-core/wget/meta.yml b/modules/nf-core/wget/meta.yml new file mode 100644 index 0000000..56df0af --- /dev/null +++ b/modules/nf-core/wget/meta.yml @@ -0,0 +1,52 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "wget" +description: The non-interactive network downloader +keywords: + - "wget" + - "download" + - "network" +tools: + - "wget": + description: "wget is a free utility for non-interactive download of files from + the Web." 
+ homepage: "https://www.gnu.org/software/wget/" + documentation: "https://www.gnu.org/software/wget/manual/wget.html" + licence: ["GPL"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - url: + type: string + description: URL to download + pattern: "^https?://*.*" + +output: + outfile: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - ${prefix}.${suffix}: + type: file + description: Downloaded file + pattern: "*.*" + + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@itrujnara" +maintainers: + - "@itrujnara" diff --git a/modules/nf-core/wget/tests/main.nf.test b/modules/nf-core/wget/tests/main.nf.test new file mode 100644 index 0000000..e094288 --- /dev/null +++ b/modules/nf-core/wget/tests/main.nf.test @@ -0,0 +1,62 @@ +// nf-core modules test wget +nextflow_process { + + name "Test Process WGET" + script "../main.nf" + process "WGET" + + tag "modules" + tag "modules_nfcore" + tag "wget" + + test("sarscov2 - gff") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/modules/data/genomics/sarscov2/genome/genome.gff3", + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - gff - stub") { + + options "-stub" + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/modules/data/genomics/sarscov2/genome/genome.gff3", + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/wget/tests/main.nf.test.snap b/modules/nf-core/wget/tests/main.nf.test.snap new file mode 100644 index 0000000..6c05160 --- /dev/null +++ b/modules/nf-core/wget/tests/main.nf.test.snap @@ -0,0 +1,70 @@ +{ + "sarscov2 - gff": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gff3:md5,357435a81a9981a0128e840ebe11051e" + ] + ], + "1": [ + "versions.yml:md5,a747f72db5fc051f64676a0ba6f32f35" + ], + "outfile": [ + [ + { + "id": "test" + }, + "test.gff3:md5,357435a81a9981a0128e840ebe11051e" + ] + ], + "versions": [ + "versions.yml:md5,a747f72db5fc051f64676a0ba6f32f35" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.10.4" + }, + "timestamp": "2025-03-26T12:27:32.67617" + }, + "sarscov2 - gff - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,a747f72db5fc051f64676a0ba6f32f35" + ], + "outfile": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,a747f72db5fc051f64676a0ba6f32f35" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.10.4" + }, + "timestamp": "2025-03-26T12:21:06.414955" + } +} \ No newline at end of file diff --git a/modules/nf-core/wget/tests/nextflow.config b/modules/nf-core/wget/tests/nextflow.config new file mode 100644 index 0000000..236f4e1 --- /dev/null +++ b/modules/nf-core/wget/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + withName: "WGET" { + ext.prefix = "test" + ext.suffix = "gff3" + } +} diff --git a/nextflow.config b/nextflow.config index e56f91f..f83d29d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,6 +25,9 @@ params { skip_funfam = false funfam_db = null funfam_latest_link = "https://download.cathdb.info/cath/releases/all-releases/v4_3_0/sequence-data/funfam-hmm3-v4_3_0.lib.gz" + 
skip_mroot = false + mroot_db = null + mroot_latest_link = "https://pavlopoulos-lab.org/metagroot/DownloadHmm" hmmsearch_evalue_cutoff = 0.001 // Functional annotation diff --git a/nextflow_schema.json b/nextflow_schema.json index b7ad6d8..754cd6c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -276,6 +276,23 @@ "default": "https://download.cathdb.info/cath/releases/all-releases/v4_3_0/sequence-data/funfam-hmm3-v4_3_0.lib.gz", "description": "CATH hosted link to the latest available (v4_3_0) FunFam HMM database file." }, + "skip_mroot": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Skip the domain annotation with the MetagRoot database.", + "help_text": "Skips the domain annotation of input sequence against a MetagRoot database." + }, + "mroot_db": { + "type": "string", + "format": "file-path", + "description": "Path to an already installed MetagRoot HMM database (.tar.gz).", + "help_text": "If left null and skip_mroot is false, the pipeline will start downloading the latest MetagRoot HMM library." + }, + "mroot_latest_link": { + "type": "string", + "default": "https://pavlopoulos-lab.org/metagroot/DownloadHmm", + "description": "MetagRoot hosted link to the latest available MetagRoot HMM database file." 
+ }, "hmmsearch_evalue_cutoff": { "type": "number", "default": 0.001, diff --git a/subworkflows/local/domain_annotation/main.nf b/subworkflows/local/domain_annotation/main.nf index 1ec8289..4e76a56 100644 --- a/subworkflows/local/domain_annotation/main.nf +++ b/subworkflows/local/domain_annotation/main.nf @@ -1,7 +1,10 @@ include { ARIA2 as ARIA2_PFAM } from '../../../modules/nf-core/aria2/main' include { ARIA2 as ARIA2_FUNFAM } from '../../../modules/nf-core/aria2/main' +include { WGET as WGET_MROOT } from '../../../modules/nf-core/wget/main' include { HMMER_HMMSEARCH as HMMSEARCH_PFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' include { HMMER_HMMSEARCH as HMMSEARCH_FUNFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' +include { HMMER_HMMSEARCH as HMMSEARCH_MROOT } from '../../../modules/nf-core/hmmer/hmmsearch/main' +include { UNTAR as UNTAR_MROOT } from '../../../modules/nf-core/untar/main' workflow DOMAIN_ANNOTATION { take: @@ -12,12 +15,16 @@ workflow DOMAIN_ANNOTATION { skip_funfam // boolean funfam_db // string, path to the funfam HMM database, if already exists funfam_latest_link // string, path to the latest funfam HMM database, to download + skip_mroot // boolean + mroot_db // string, path to the metagroot HMM database, if already exists + mroot_latest_link // string, path to the latest metagroot HMM database, to download main: ch_versions = channel.empty() ch_pfam_domains = channel.empty() ch_funfam_domains = channel.empty() + ch_mroot_domains = channel.empty() if (!skip_pfam) { if (!pfam_db) { @@ -59,8 +66,38 @@ workflow DOMAIN_ANNOTATION { ch_funfam_domains = HMMSEARCH_FUNFAM.out.domain_summary } + if (!skip_mroot) { + if (!mroot_db) { + ch_mroot_link = channel.of([ [ id: 'mroot' ], mroot_latest_link ]) + // download file from url + WGET_MROOT( ch_mroot_link ) + ch_versions = ch_versions.mix( WGET_MROOT.out.versions ) + // unpack the downloaded tar.gz archive + UNTAR_MROOT( WGET_MROOT.out.outfile ) + // extract hmm files from dir + ch_mroot_db = UNTAR_MROOT.out.untar + .map { + meta, dir -> + // 
collect all .hmm files from dir (NOTE(review): the glob yields a list of paths; confirm hmmsearch is meant to receive more than one model file) + def hmm_files = file("${dir}/**/*.hmm") + tuple(meta, hmm_files) + } + } else { + ch_mroot_db = channel.of([ [ id: 'mroot' ], mroot_db ]) + } + + ch_input_for_hmmsearch_mroot = ch_fasta + .combine(ch_mroot_db) + .map{ meta, seqs, _meta2, models -> [meta, models, seqs, false, false, true] } + + HMMSEARCH_MROOT( ch_input_for_hmmsearch_mroot ) + ch_versions = ch_versions.mix( HMMSEARCH_MROOT.out.versions.first() ) + ch_mroot_domains = HMMSEARCH_MROOT.out.domain_summary + } + emit: pfam_domains = ch_pfam_domains funfam_domains = ch_funfam_domains + mroot_domains = ch_mroot_domains versions = ch_versions } diff --git a/subworkflows/local/domain_annotation/meta.yml b/subworkflows/local/domain_annotation/meta.yml index e04e241..ad37de9 100644 --- a/subworkflows/local/domain_annotation/meta.yml +++ b/subworkflows/local/domain_annotation/meta.yml @@ -42,6 +42,18 @@ input: type: string description: | Path to the latest FunFam HMM database, to download + - skip_mroot: + type: boolean + description: | + Skip domain annotation with MetagRoot + - mroot_db: + type: string + description: | + Path to an existing HMM MetagRoot library on the system. If provided, the WGET_MROOT db download will be skipped. 
+ - mroot_latest_link: + type: string + description: | + Path to the latest MetagRoot HMM database, to download output: - pfam_domains: type: file @@ -51,6 +63,10 @@ output: type: file description: | domtbl.gz files with funfam domain annotation for input amino acid sequences + - mroot_domains: + type: file + description: | + domtbl.gz files with metagroot domain annotation for input amino acid sequences - versions: type: file description: | diff --git a/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf index 1ba3ccc..7ef2d1a 100644 --- a/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf @@ -180,7 +180,7 @@ def toolCitationText() { params.skip_preprocessing ? "" : "Input sequences were preprocessed with SeqKit (gap trimming, length filtering, validation, duplicate removal) (Shen et al. 2024)." ].join(' ').trim() - def domain_annotation_text = (params.skip_pfam && params.skip_funfam) ? "" : "Domains were annotated with hmmer/hmmsearch (Eddy et al. 2011)." + def domain_annotation_text = (params.skip_pfam && params.skip_funfam && params.skip_mroot) ? "" : "Domains were annotated with hmmer/hmmsearch (Eddy et al. 2011)." def prediction_text = params.skip_s4pred ? "" : "Secondary structures were predicted via the s4pred software (Moffat et al. 2021)." @@ -202,7 +202,7 @@ def toolBibliographyText() { params.skip_preprocessing ? '' : '