diff --git a/CHANGELOG.md b/CHANGELOG.md index 34faf3bc..93621036 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 1. Added the samplesheet to the pipeline output as `OUTDIR/samplesheet.csv` 2. Added the `--bedpe` parameter. This makes the pipeline output BEDPE files alongside the VCF files. 3. Added parallelization on SV type to the delly flow +4. Added a `--gtf` parameter for annotation of gene and transcript overlap using `gatk SVAnnotate`. ### `Changes` diff --git a/assets/svync/delly.yaml b/assets/svync/delly.yaml index 55d1c5d8..45f4ca12 100644 --- a/assets/svync/delly.yaml +++ b/assets/svync/delly.yaml @@ -1,6 +1,8 @@ id: delly_$INFO/SVTYPE alt: - BND: TRA + alts: + BND: + value: <$INFO/SVTYPE> info: CALLERS: value: delly diff --git a/assets/svync/manta.yaml b/assets/svync/manta.yaml index d081ef29..440618a4 100644 --- a/assets/svync/manta.yaml +++ b/assets/svync/manta.yaml @@ -1,4 +1,6 @@ id: manta_$INFO/SVTYPE +alt: + value: <$INFO/SVTYPE> info: CALLERS: value: manta diff --git a/bin/preprocess_gtf.py b/bin/preprocess_gtf.py new file mode 100755 index 00000000..c252cd05 --- /dev/null +++ b/bin/preprocess_gtf.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# https://github.com/broadinstitute/gatk-sv/blob/main/scripts/inputs/preprocess_gtf.py + +""" +Preprocess GENCODE basic GTF to extract canonical protein-coding transcripts for functional consequence annotation. 
+""" + +import argparse +import gzip + + +CHROM_FIELD = 0 +ELEMENT_FIELD = 2 +ATTRIBUTES_FIELD = 8 +TRANSCRIPT_TYPES = {"protein_coding", "nonsense_mediated_decay"} +CANONICAL = {"MANE_Plus_Clinical", "MANE_Select", "Ensembl_canonical"} + + +# Flexibly open .gz or uncompressed file to read (text mode in both cases) +def _open(filename): + if filename.endswith(".gz"): + return gzip.open(filename, 'rt') + else: + return open(filename, 'r') + + +# Extract transcript type and canonical status; returns a (protein, canonical) tuple of booleans +def parse_attributes(field): + # format: key1 "value1"; key2 "value2"; + # keys may be repeated so cannot convert directly to dictionary; values may contain spaces, so split on the first space only and skip entries without a value + attributes_list = [tuple(x.strip().replace('"', '').split(' ', 1)) for x in field.strip().rstrip(";").split("; ") if ' ' in x.strip()] + protein = False + canonical = False + for key, val in attributes_list: + if key == "tag" and val in CANONICAL: + canonical = True + elif key == "transcript_type" and val in TRANSCRIPT_TYPES: + protein = True + return protein, canonical + + +def process(gtf, outfile): + with _open(gtf) as inp, open(outfile, 'w') as out: + gene_line = "" + for line in inp: + if line.startswith("#") or not line.strip(): + continue + fields = line.rstrip('\n').split('\t') + + # Drop mitochondria + if fields[CHROM_FIELD] == 'chrM': + continue + + # Store gene line to print if transcript is eligible + if fields[ELEMENT_FIELD] == "gene": + gene_line = line + continue + + # Select protein-coding and canonical transcripts only + protein, canonical = parse_attributes(fields[ATTRIBUTES_FIELD]) + if protein and canonical: + out.write(gene_line + line) + gene_line = "" # only print gene line before first transcript line + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('gtf', help="Input GTF from GENCODE") + parser.add_argument('outfile', help="Output filename") + args = parser.parse_args() + + process(args.gtf, args.outfile) + + +if __name__ == '__main__': + main() diff --git a/conf/modules.config b/conf/modules.config index ed24085e..378a340d 100644 --- a/conf/modules.config +++ 
b/conf/modules.config @@ -259,6 +259,10 @@ process { ext.args = "-ends" } + withName: "^.*GATK4_SVANNOTATE\$" { + ext.prefix = {"${meta.id}.${meta.variant_type}.svannotate"} + } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SV AND CNV FILTERING diff --git a/conf/test.config b/conf/test.config index 3d807df7..a9672f77 100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,6 +33,8 @@ params { // Fasta references fasta = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta" fai = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta.fai" + dict = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.dict" + gtf = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.gtf" // bwa = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/bwa.tar.gz" expansionhunter_catalog = params.test_data["homo_sapiens"]["genome"]["expansionhunter"] qdnaseq_male = params.test_data["homo_sapiens"]["genome"]["genome_qdnaseq"] diff --git a/main.nf b/main.nf index 065e57f2..945f1d6f 100644 --- a/main.nf +++ b/main.nf @@ -28,6 +28,8 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_stru params.fasta = getGenomeAttribute('fasta') params.fai = getGenomeAttribute('fai') +params.dict = getGenomeAttribute('dict') +params.gtf = getGenomeAttribute('gtf') params.vep_cache = getGenomeAttribute('vep_cache') // params.bwa = getGenomeAttribute('bwa') params.annotsv_annotations = getGenomeAttribute('annotsv_annotations') @@ -81,6 +83,8 @@ workflow { // files params.fasta, params.fai, + params.dict, + params.gtf, params.expansionhunter_catalog ?: "https://github.com/Illumina/ExpansionHunter/raw/master/variant_catalog/grch38/variant_catalog.json", 
params.qdnaseq_female, params.qdnaseq_male, diff --git a/modules.json b/modules.json index f21c7565..3a02a14a 100644 --- a/modules.json +++ b/modules.json @@ -7,12 +7,12 @@ "nf-core": { "annotsv/annotsv": { "branch": "master", - "git_sha": "a94ad45fa5b350961c374c46f79bc86cd5853353", + "git_sha": "296d216c3f6384936a6526b6fbed7e6412259fb4", "installed_by": ["modules"] }, "annotsv/installannotations": { "branch": "master", - "git_sha": "a94ad45fa5b350961c374c46f79bc86cd5853353", + "git_sha": "296d216c3f6384936a6526b6fbed7e6412259fb4", "installed_by": ["modules"] }, "bcftools/annotate": { @@ -66,6 +66,16 @@ "git_sha": "b42fec6f7c6e5d0716685cabb825ef6bf6e386b5", "installed_by": ["modules"] }, + "gatk4/createsequencedictionary": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "gatk4/svannotate": { + "branch": "master", + "git_sha": "cc7e281e7877146dac79c5a484e6e2b10086234a", + "installed_by": ["modules"] + }, "gawk": { "branch": "master", "git_sha": "b42fec6f7c6e5d0716685cabb825ef6bf6e386b5", @@ -135,7 +145,7 @@ }, "svync": { "branch": "master", - "git_sha": "916a4cbc4f831d501860495b157c4857833e22a7", + "git_sha": "0fc190096fa8dcc9878cef178479f22e03f174a1", "installed_by": ["modules"] }, "tabix/bgziptabix": { diff --git a/modules/local/preprocess_gtf/environment.yml b/modules/local/preprocess_gtf/environment.yml new file mode 100644 index 00000000..519d3a96 --- /dev/null +++ b/modules/local/preprocess_gtf/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.13.5 diff --git a/modules/local/preprocess_gtf/main.nf b/modules/local/preprocess_gtf/main.nf new file mode 100644 index 00000000..d140ef8a --- /dev/null +++ b/modules/local/preprocess_gtf/main.nf @@ -0,0 +1,40 @@ +process PREPROCESS_GTF { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8a/8ad257d53c2a2b8810d2b12d4d8e3ea438bc8c4a6be7c39b0354cd7bb8d5c260/data': + 'community.wave.seqera.io/library/python:3.13.5--18032a8dc5d4b91e' }" + + input: + tuple val(meta), path(gtf) + + output: + tuple val(meta), path("*.sanitized.gtf"), emit: gtf + path "versions.yml" , emit: versions + + script: + def prefix = task.ext.prefix ?: "${gtf.baseName}" + + """ + preprocess_gtf.py $gtf ${prefix}.sanitized.gtf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${gtf.baseName}" + + """ + touch ${prefix}.sanitized.gtf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/annotsv/annotsv/annotsv-annotsv.diff b/modules/nf-core/annotsv/annotsv/annotsv-annotsv.diff deleted file mode 100644 index a9c3fe23..00000000 --- a/modules/nf-core/annotsv/annotsv/annotsv-annotsv.diff +++ /dev/null @@ -1,45 +0,0 @@ -Changes in module 'nf-core/annotsv/annotsv' ---- modules/nf-core/annotsv/annotsv/main.nf -+++ modules/nf-core/annotsv/annotsv/main.nf -@@ -17,7 +17,7 @@ - output: - tuple val(meta), path("*.tsv") , emit: tsv - tuple val(meta), path("*.unannotated.tsv") , emit: unannotated_tsv, optional: true -- tuple val(meta), path("*.vcf") , emit: vcf, optional: true -+ tuple val(meta), path("*.vcf") , emit: vcf - path "versions.yml" , emit: versions - - when: -@@ -39,11 +39,15 @@ - ${small_variants} \\ - ${fp_snv} \\ - ${transcripts} \\ -- -outputFile ${prefix}.tsv \\ -+ -outputFile ${prefix}.raw.tsv \\ - -SVinputFile ${sv_vcf} \\ - ${args} - - mv *_AnnotSV/* . 
-+ awk 'BEGIN { FS=OFS="\t" } { if (NR > 1 && NF >= 8) \$1 = \$1 "_" NR; print }' ${prefix}.raw.tsv > ${prefix}.tsv -+ -+ variantconvert convert -i ${prefix}.tsv -o ${prefix}.vcf -fi annotsv -fo vcf -c /usr/local/share/python3/variantconvert/configs/GRCh38/annotsv3_from_vcf.json -+ sed -i 's/contig= versions.yml - "${task.process}": -@@ -55,12 +59,10 @@ - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - -- def create_vcf = args.contains("-vcf 1") ? "touch ${prefix}.vcf" : "" -- - """ - touch ${prefix}.tsv - touch ${prefix}.unannotated.tsv -- ${create_vcf} -+ touch ${prefix}.vcf - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - -************************************************************ diff --git a/modules/nf-core/annotsv/annotsv/environment.yml b/modules/nf-core/annotsv/annotsv/environment.yml index e5f4a3e4..d8f5ae51 100644 --- a/modules/nf-core/annotsv/annotsv/environment.yml +++ b/modules/nf-core/annotsv/annotsv/environment.yml @@ -1,7 +1,7 @@ -name: annotsv_annotsv +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda - - defaults dependencies: - - bioconda::annotsv=3.4.2 + - bioconda::annotsv=3.4.6 diff --git a/modules/nf-core/annotsv/annotsv/main.nf b/modules/nf-core/annotsv/annotsv/main.nf index e9d47db7..63eeab67 100644 --- a/modules/nf-core/annotsv/annotsv/main.nf +++ b/modules/nf-core/annotsv/annotsv/main.nf @@ -4,8 +4,11 @@ process ANNOTSV_ANNOTSV { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'oras://community.wave.seqera.io/library/annotsv:3.4.2--141a0ee560de1897' : - 'community.wave.seqera.io/library/annotsv:3.4.2--010fa21247b5b64b' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/df/df5e87ce610d595afe5f1e4017c255e53590cfa9b156c43310d637b36bfd12b3/data' : + 'community.wave.seqera.io/library/annotsv:3.4.6--c5a6da1bf4c2c8e5' }" + + // Container options are needed to allow AnnotSV to overwrite a file in a dependency directory in Singularity + containerOptions "${ workflow.containerEngine == 'singularity' ? '--writable-tmpfs' : ''}" input: tuple val(meta), path(sv_vcf), path(sv_vcf_index), path(candidate_small_variants) @@ -15,10 +18,10 @@ process ANNOTSV_ANNOTSV { tuple val(meta5), path(gene_transcripts) output: - tuple val(meta), path("*.tsv") , emit: tsv - tuple val(meta), path("*.unannotated.tsv") , emit: unannotated_tsv, optional: true - tuple val(meta), path("*.vcf") , emit: vcf, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path("*.tsv") , emit: tsv + tuple val(meta), path("*.unannotated.tsv"), emit: unannotated_tsv, optional: true + tuple val(meta), path("*.vcf") , emit: vcf , optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -27,10 +30,10 @@ process ANNOTSV_ANNOTSV { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def cand_genes = candidate_genes ? "-candidateGenesFile ${candidate_genes}" : "" + def cand_genes = candidate_genes ? "-candidateGenesFile ${candidate_genes}" : "" def small_variants = candidate_small_variants ? "-candidateSnvIndelFiles ${candidate_small_variants}" : "" - def fp_snv = false_positive_snv ? "-snvIndelFiles ${false_positive_snv}" : "" - def transcripts = gene_transcripts ? "-txFile ${gene_transcripts}" : "" + def fp_snv = false_positive_snv ? "-snvIndelFiles ${false_positive_snv}" : "" + def transcripts = gene_transcripts ? 
"-txFile ${gene_transcripts}" : "" """ AnnotSV \\ @@ -52,14 +55,12 @@ process ANNOTSV_ANNOTSV { """ stub: - def args = task.ext.args ?: '' + def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def create_vcf = args.contains("-vcf 1") ? "touch ${prefix}.vcf" : "" """ - echo "$args" - touch ${prefix}.tsv touch ${prefix}.unannotated.tsv ${create_vcf} diff --git a/modules/nf-core/annotsv/annotsv/meta.yml b/modules/nf-core/annotsv/annotsv/meta.yml index f690f0aa..1ba4feaf 100644 --- a/modules/nf-core/annotsv/annotsv/meta.yml +++ b/modules/nf-core/annotsv/annotsv/meta.yml @@ -14,80 +14,118 @@ tools: tool_dev_url: "https://github.com/lgmgeo/AnnotSV" doi: 10.1093/bioinformatics/bty304 licence: ["GPL-3.0"] + identifier: biotools:AnnotSV input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - sv_vcf: - type: file - description: A VCF or BED file containing the structural variants to be annotated - pattern: "*.{bed,vcf,vcf.gz}" - - sv_vcf_index: - type: file - description: OPTIONAL - The index for gzipped VCF files - pattern: "*.tbi" - - candidate_small_variants: - type: file - description: OPTIONAL - A file containing candidate small variants - pattern: "*.{vcf,vcf.gz}" - - meta2: - type: map - description: | - Groovy Map containing annotations information - - annotations: - type: directory - description: | - The directory containing the annotations (URL to download this will be made available soon) - For now this can be downloaded in the way defined in the repo (https://github.com/lgmgeo/AnnotSV#quick-installation) - - meta3: - type: map - description: | - Groovy Map containing candidate genes information - - candidate_genes: - type: file - description: OPTIONAL - A file containing genes (either space-separated, tab-separated or line-break-separated) - pattern: "*.txt" - - meta4: - type: map - description: | - Groovy Map containing candidate false positive SNV information - - 
false_positive_snv: - type: file - description: OPTIONAL - A VCF file containing small variant candidates - pattern: "*.{vcf,vcf.gz}" - - meta5: - type: map - description: | - Groovy Map containing candidate gene transcripts information - - gene_transcripts: - type: file - description: OPTIONAL - A file containing the preferred gene transcripts to be used in priority during annotation (either space-separated or tab-separated) - pattern: "*.txt" + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - sv_vcf: + type: file + description: A VCF or BED file containing the structural variants to be annotated + pattern: "*.{bed,vcf,vcf.gz}" + ontologies: + - edam: "http://edamontology.org/format_3003" # BED + - edam: "http://edamontology.org/format_3016" # VCF + - sv_vcf_index: + type: file + description: OPTIONAL - The index for gzipped VCF files + pattern: "*.tbi" + ontologies: + - edam: "http://edamontology.org/format_3475" # Tabix index + - candidate_small_variants: + type: file + description: OPTIONAL - A file containing candidate small variants + pattern: "*.{vcf,vcf.gz}" + ontologies: + - edam: "http://edamontology.org/format_3016" # VCF + - - meta2: + type: map + description: | + Groovy Map containing annotations information + - annotations: + type: directory + description: | + The directory containing the annotations (URL to download this will be made available soon) + For now this can be downloaded in the way defined in the repo (https://github.com/lgmgeo/AnnotSV#quick-installation) + - - meta3: + type: map + description: | + Groovy Map containing candidate genes information + - candidate_genes: + type: file + description: OPTIONAL - A file containing genes (either space-separated, tab-separated + or line-break-separated) + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_2330" # Text file + - - meta4: + type: map + description: | + Groovy Map containing candidate 
false positive SNV information + - false_positive_snv: + type: file + description: OPTIONAL - A VCF file containing small variant candidates + pattern: "*.{vcf,vcf.gz}" + ontologies: + - edam: "http://edamontology.org/format_3016" # VCF + - - meta5: + type: map + description: | + Groovy Map containing candidate gene transcripts information + - gene_transcripts: + type: file + description: OPTIONAL - A file containing the preferred gene transcripts to + be used in priority during annotation (either space-separated or tab-separated) + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_2330" # Text file output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - tsv: - type: file - description: A TSV file containing the annotated variants - pattern: "*.tsv" - - unannotated_tsv: - type: file - description: OPTIONAL - TSV file containing the unannotated variants - pattern: "*.unannotated.tsv" - - vcf: - type: file - description: | - OPTIONAL - A VCF file containing the annotated variants (created when `-vcf 1` is specified in the args) - pattern: "*.vcf" + tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tsv": + type: file + description: A TSV file containing the annotated variants + pattern: "*.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" # TSV + unannotated_tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.unannotated.tsv": + type: file + description: OPTIONAL - TSV file containing the unannotated variants + pattern: "*.unannotated.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" # TSV + vcf: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.vcf": + type: file + description: | + OPTIONAL - A VCF file containing the annotated variants (created when `-vcf 1` is specified in the args) + pattern: "*.vcf" + ontologies: + - edam: "http://edamontology.org/format_3016" # VCF + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML authors: - "@nvnieuwk" maintainers: diff --git a/modules/nf-core/annotsv/annotsv/tests/main.nf.test.snap b/modules/nf-core/annotsv/annotsv/tests/main.nf.test.snap index fd46d7e7..53516807 100644 --- a/modules/nf-core/annotsv/annotsv/tests/main.nf.test.snap +++ b/modules/nf-core/annotsv/annotsv/tests/main.nf.test.snap @@ -25,7 +25,7 @@ ], "3": [ - "versions.yml:md5,a5c7d9d19db00a62006faa1bafa917ec" + "versions.yml:md5,8538f3187cc91433cef9d3db1850fbb1" ], "tsv": [ [ @@ -50,14 +50,14 @@ ], "versions": [ - "versions.yml:md5,a5c7d9d19db00a62006faa1bafa917ec" + "versions.yml:md5,8538f3187cc91433cef9d3db1850fbb1" ] } ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.1" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2024-05-29T15:10:39.33144868" + "timestamp": "2025-07-10T16:46:10.783184639" } } \ No newline at end of file diff --git a/modules/nf-core/annotsv/annotsv/tests/tags.yml b/modules/nf-core/annotsv/annotsv/tests/tags.yml deleted file mode 100644 index 54453426..00000000 --- a/modules/nf-core/annotsv/annotsv/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -annotsv/annotsv: - - "modules/nf-core/annotsv/annotsv/**" diff --git 
a/modules/nf-core/annotsv/installannotations/environment.yml b/modules/nf-core/annotsv/installannotations/environment.yml index b759f91d..d8f5ae51 100644 --- a/modules/nf-core/annotsv/installannotations/environment.yml +++ b/modules/nf-core/annotsv/installannotations/environment.yml @@ -1,7 +1,7 @@ -name: annotsv_installannotations +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda - - defaults dependencies: - - bioconda::annotsv=3.4.2 + - bioconda::annotsv=3.4.6 diff --git a/modules/nf-core/annotsv/installannotations/main.nf b/modules/nf-core/annotsv/installannotations/main.nf index 371e1bbf..147bbc12 100644 --- a/modules/nf-core/annotsv/installannotations/main.nf +++ b/modules/nf-core/annotsv/installannotations/main.nf @@ -4,8 +4,8 @@ process ANNOTSV_INSTALLANNOTATIONS { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'oras://community.wave.seqera.io/library/annotsv:3.4.2--141a0ee560de1897' : - 'community.wave.seqera.io/library/annotsv:3.4.2--010fa21247b5b64b' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/df/df5e87ce610d595afe5f1e4017c255e53590cfa9b156c43310d637b36bfd12b3/data' : + 'community.wave.seqera.io/library/annotsv:3.4.6--c5a6da1bf4c2c8e5' }" output: path "AnnotSV_annotations", emit: annotations diff --git a/modules/nf-core/annotsv/installannotations/meta.yml b/modules/nf-core/annotsv/installannotations/meta.yml index a04c4f1d..670c7150 100644 --- a/modules/nf-core/annotsv/installannotations/meta.yml +++ b/modules/nf-core/annotsv/installannotations/meta.yml @@ -12,15 +12,22 @@ tools: documentation: "https://lbgi.fr/AnnotSV/" tool_dev_url: "https://github.com/lgmgeo/AnnotSV" licence: ["GPL v3"] + identifier: biotools:AnnotSV output: - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - annotations: - type: file - description: A folder containing the annotations - pattern: "AnnotSV_annotations" + annotations: + - AnnotSV_annotations: + type: file + description: A folder containing the annotations + pattern: "AnnotSV_annotations" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML authors: - "@nvnieuwk" maintainers: diff --git a/modules/nf-core/annotsv/installannotations/tests/main.nf.test.snap b/modules/nf-core/annotsv/installannotations/tests/main.nf.test.snap index bb45f1bd..bc1a43d7 100644 --- a/modules/nf-core/annotsv/installannotations/tests/main.nf.test.snap +++ b/modules/nf-core/annotsv/installannotations/tests/main.nf.test.snap @@ -8,7 +8,7 @@ ] ], "1": [ - "versions.yml:md5,d0b3dc5e0199653fd81ffd3754e65f9c" + "versions.yml:md5,a7ab4be6d37013bd8c0ca0e388c0a4ee" ], "annotations": [ [ 
@@ -16,14 +16,14 @@ ] ], "versions": [ - "versions.yml:md5,d0b3dc5e0199653fd81ffd3754e65f9c" + "versions.yml:md5,a7ab4be6d37013bd8c0ca0e388c0a4ee" ] } ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.1" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2024-05-29T15:14:54.723053976" + "timestamp": "2025-07-10T16:55:38.662275457" } } \ No newline at end of file diff --git a/modules/nf-core/annotsv/installannotations/tests/tags.yml b/modules/nf-core/annotsv/installannotations/tests/tags.yml deleted file mode 100644 index 232bec39..00000000 --- a/modules/nf-core/annotsv/installannotations/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -annotsv/installannotations: - - "modules/nf-core/annotsv/installannotations/**" diff --git a/modules/nf-core/gatk4/createsequencedictionary/environment.yml b/modules/nf-core/gatk4/createsequencedictionary/environment.yml new file mode 100644 index 00000000..b562b72c --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + # renovate: datasource=conda depName=bioconda/gatk4 + - bioconda::gatk4=4.6.1.0 + - bioconda::gcnvkernel=0.9 diff --git a/modules/nf-core/gatk4/createsequencedictionary/main.nf b/modules/nf-core/gatk4/createsequencedictionary/main.nf new file mode 100644 index 00000000..998622a0 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/main.nf @@ -0,0 +1,52 @@ +process GATK4_CREATESEQUENCEDICTIONARY { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b2/b28daf5d9bb2f0d129dcad1b7410e0dd8a9b087aaf3ec7ced929b1f57624ad98/data': + 'community.wave.seqera.io/library/gatk4_gcnvkernel:e48d414933d188cd' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path('*.dict') , emit: dict + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + def avail_mem = 6144 + if (!task.memory) { + log.info '[GATK CreateSequenceDictionary] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + CreateSequenceDictionary \\ + --REFERENCE $fasta \\ + --URI $fasta \\ + --TMP_DIR . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${fasta.baseName}.dict + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/createsequencedictionary/meta.yml b/modules/nf-core/gatk4/createsequencedictionary/meta.yml new file mode 100644 index 00000000..72dced28 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/meta.yml @@ -0,0 +1,54 @@ +name: gatk4_createsequencedictionary +description: Creates a sequence dictionary for a reference sequence +keywords: + - createsequencedictionary + - dictionary + - fasta + - gatk4 +tools: + - gatk: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. 
Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Input fasta file + pattern: "*.{fasta,fa}" + ontologies: [] +output: + dict: + - - meta: + type: file + description: gatk dictionary file + pattern: "*.{dict}" + ontologies: [] + - "*.dict": + type: file + description: gatk dictionary file + pattern: "*.{dict}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@maxulysse" + - "@ramprasadn" +maintainers: + - "@maxulysse" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test b/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test new file mode 100644 index 00000000..a8a9c6d2 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process GATK4_CREATESEQUENCEDICTIONARY" + script "../main.nf" + process "GATK4_CREATESEQUENCEDICTIONARY" + + tag "modules" + tag "modules_nfcore" + tag "gatk4" + tag "gatk4/createsequencedictionary" + + test("sarscov2 - fasta") { + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fasta - stub") { + + options "-stub" + + when { + process 
{ + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test.snap b/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test.snap new file mode 100644 index 00000000..e8a600fd --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "sarscov2 - fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.dict:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,e993b2c99f7f6b0fcd8428de15c61439" + ], + "dict": [ + [ + { + "id": "test" + }, + "genome.dict:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e993b2c99f7f6b0fcd8428de15c61439" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-10-31T10:51:56.155954077" + }, + "sarscov2 - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.dict:md5,7362679f176e0f52add03c08f457f646" + ] + ], + "1": [ + "versions.yml:md5,e993b2c99f7f6b0fcd8428de15c61439" + ], + "dict": [ + [ + { + "id": "test" + }, + "genome.dict:md5,7362679f176e0f52add03c08f457f646" + ] + ], + "versions": [ + "versions.yml:md5,e993b2c99f7f6b0fcd8428de15c61439" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-10-31T10:51:45.562993875" + } +} \ No newline at end of file diff --git a/modules/nf-core/gatk4/svannotate/environment.yml b/modules/nf-core/gatk4/svannotate/environment.yml new file mode 100644 index 00000000..b562b72c --- /dev/null +++ b/modules/nf-core/gatk4/svannotate/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + # renovate: datasource=conda depName=bioconda/gatk4 + - bioconda::gatk4=4.6.1.0 + - bioconda::gcnvkernel=0.9 diff --git a/modules/nf-core/gatk4/svannotate/main.nf b/modules/nf-core/gatk4/svannotate/main.nf new file mode 100644 index 00000000..5d6f4fed --- /dev/null +++ b/modules/nf-core/gatk4/svannotate/main.nf @@ -0,0 +1,70 @@ +process GATK4_SVANNOTATE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b2/b28daf5d9bb2f0d129dcad1b7410e0dd8a9b087aaf3ec7ced929b1f57624ad98/data': + 'community.wave.seqera.io/library/gatk4_gcnvkernel:e48d414933d188cd' }" + + input: + tuple val(meta), path(vcf), path(tbi), path(bed), path(non_coding_bed) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fasta_fai) + tuple val(meta4), path(dict) + tuple val(meta5), path(gtf) + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple val(meta), path("*.vcf.gz.tbi") , emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def intervals = bed ? "--intervals ${bed}" : "" + def reference = fasta ? "--reference ${fasta}" : "" + def transcripts = gtf ? "--protein-coding-gtf ${gtf}" : "" + def non_coding = non_coding_bed ? "--non-coding-bed ${non_coding_bed}" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK SVAnnotate] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' 
+ } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + SVAnnotate \\ + --variant ${vcf} \\ + --output ${prefix}.vcf.gz \\ + ${intervals} \\ + ${reference} \\ + ${transcripts} \\ + ${non_coding} \\ + --tmp-dir . \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + echo | gzip > ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/svannotate/meta.yml b/modules/nf-core/gatk4/svannotate/meta.yml new file mode 100644 index 00000000..5e830c38 --- /dev/null +++ b/modules/nf-core/gatk4/svannotate/meta.yml @@ -0,0 +1,125 @@ +name: "gatk4_svannotate" +description: Adds predicted functional consequence, gene overlap, and noncoding + element overlap annotations to SV VCF from GATK-SV pipeline. Input files are + an SV VCF, a GTF file containing primary or canonical transcripts, and a BED + file containing noncoding elements. Output file is an annotated SV VCF. +keywords: + - annotate + - gatk4 + - structural variants + - svannotate + - vcf +tools: + - gatk4: + description: Genome Analysis Toolkit (GATK4) + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + tool_dev_url: https://github.com/broadinstitute/gatk + doi: "10.1158/1538-7445.AM2017-3590" + licence: ["BSD-3-clause"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: A VCF file created with a structural variant caller + pattern: "*.vcf.gz" + ontologies: + - edam: http://edamontology.org/format_3989 # GZIP format + - tbi: + type: file + description: The index file of the VCF + pattern: "*.vcf.gz.tbi" + ontologies: [] + - bed: + type: file + description: Regions to limit the analysis to + pattern: "*.bed" + ontologies: [] + - non_coding_bed: + type: file + description: File containing noncoding regions + pattern: "*.bed" + ontologies: [] + - - meta2: + type: map + description: | + Groovy Map containing FASTA information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Optional - reference FASTA file needed when the input is a + CRAM file + pattern: "*.{fasta,fa}" + ontologies: [] + - - meta3: + type: map + description: | + Groovy Map containing FAI information + e.g. [ id:'test', single_end:false ] + - fasta_fai: + type: file + description: Optional - index of the reference FASTA file needed when the + input is a CRAM file + pattern: "*.fai" + ontologies: [] + - - meta4: + type: map + description: | + Groovy Map containing DICT information + e.g. [ id:'test', single_end:false ] + - dict: + type: file + description: Optional - sequence dictionary of the reference FASTA file + needed when the input is a CRAM file + pattern: "*.dict" + ontologies: [] + - - meta5: + type: map + description: | + Groovy Map containing GTF information + e.g. [ id:'test', single_end:false ] + - gtf: + type: file + description: Optional - GTF file containing transcript information + pattern: "*.gtf" + ontologies: [] +output: + vcf: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.vcf.gz": + type: file + description: The annotated structural variant VCF + pattern: "*.vcf.gz" + ontologies: + - edam: http://edamontology.org/format_3989 # GZIP format + tbi: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.vcf.gz.tbi": + type: file + description: The index of the VCF + pattern: "*.vcf.gz.tbi" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/nf-core/gatk4/svannotate/tests/main.nf.test b/modules/nf-core/gatk4/svannotate/tests/main.nf.test new file mode 100644 index 00000000..1fd9b260 --- /dev/null +++ b/modules/nf-core/gatk4/svannotate/tests/main.nf.test @@ -0,0 +1,112 @@ +nextflow_process { + + name "Test Process GATK4_SVANNOTATE" + script "../main.nf" + process "GATK4_SVANNOTATE" + + tag "modules" + tag "modules_nfcore" + tag "manta" + tag "manta/germline" + tag "gatk4" + tag "gatk4/svannotate" + + setup { + run("MANTA_GERMLINE") { + script "../../../manta/germline/main.nf" + process { + """ + input[0] = Channel.of([ [ id:'test' ], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true), file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram.crai', checkIfExists: true), [], []]) + input[1] = Channel.value([ [ id:'test' ], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true)]) + input[2] = Channel.value([ [ id:'test' ], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true)]) + input[3] = [] + """ + } + } + } + + test("homo sapiens - defaults") { + + when { + process { + 
""" + input[0] = MANTA_GERMLINE.out.diploid_sv_vcf.join(MANTA_GERMLINE.out.diploid_sv_vcf_tbi).map({ meta, vcf, tbi -> [ meta, vcf, tbi, [], [] ]}) + input[1] = [[:],[]] + input[2] = [[:],[]] + input[3] = [[:],[]] + input[4] = [[:],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf.collect { meta, vcf -> [ meta, file(vcf.toString()).name]}, + process.out.tbi.collect { meta, tbi -> [ meta, file(tbi.toString()).name] }, + process.out.versions + ).match() } + ) + } + } + + + test("homo sapiens - all") { + + when { + process { + """ + input[0] = MANTA_GERMLINE.out.diploid_sv_vcf + .join(MANTA_GERMLINE.out.diploid_sv_vcf_tbi) + .map { meta, vcf, tbi -> [ + meta, + vcf, + tbi, + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.bed'), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.multi_intervals.bed'), + ] } + input[1] = [[id:'fasta'], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true)] + input[2] = [[id:'fai'], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true)] + input[3] = [[id:'dict'], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.dict', checkIfExists: true)] + input[4] = [[id:'gtf'], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome_minimal.gtf', checkIfExists: true)] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf.collect { meta, vcf -> [ meta, file(vcf.toString()).name]}, + process.out.tbi.collect { meta, tbi -> [ meta, file(tbi.toString()).name] }, + process.out.versions + ).match() } + ) + } + } + + test("homo sapiens - defaults - stub") { + + options "-stub" + + when { + process { + """ + input[0] = MANTA_GERMLINE.out.diploid_sv_vcf.join(MANTA_GERMLINE.out.diploid_sv_vcf_tbi).map({ meta, vcf, tbi -> [ meta, 
vcf, tbi, [], [] ]}) + input[1] = [[:],[]] + input[2] = [[:],[]] + input[3] = [[:],[]] + input[4] = [[:],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/gatk4/svannotate/tests/main.nf.test.snap b/modules/nf-core/gatk4/svannotate/tests/main.nf.test.snap new file mode 100644 index 00000000..020125a7 --- /dev/null +++ b/modules/nf-core/gatk4/svannotate/tests/main.nf.test.snap @@ -0,0 +1,107 @@ +{ + "homo sapiens - all": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.vcf.gz" + ] + ], + [ + [ + { + "id": "test" + }, + "test.vcf.gz.tbi" + ] + ], + [ + "versions.yml:md5,2969ab236c7c7c187097a1d5f2a109e3" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-07-14T16:21:20.730000686" + }, + "homo sapiens - defaults": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.vcf.gz" + ] + ], + [ + [ + { + "id": "test" + }, + "test.vcf.gz.tbi" + ] + ], + [ + "versions.yml:md5,2969ab236c7c7c187097a1d5f2a109e3" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-07-14T16:20:55.802118653" + }, + "homo sapiens - defaults - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,2969ab236c7c7c187097a1d5f2a109e3" + ], + "tbi": [ + [ + { + "id": "test" + }, + "test.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "vcf": [ + [ + { + "id": "test" + }, + "test.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,2969ab236c7c7c187097a1d5f2a109e3" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-07-14T16:14:58.100341593" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/svync/environment.yml b/modules/nf-core/svync/environment.yml index cada6309..b70ebfbf 100644 --- a/modules/nf-core/svync/environment.yml +++ b/modules/nf-core/svync/environment.yml @@ -4,4 +4,4 @@ channels: - conda-forge - bioconda dependencies: - - bioconda::svync=0.1.2 + - bioconda::svync=0.3.0 diff --git a/modules/nf-core/svync/main.nf b/modules/nf-core/svync/main.nf index 299229c6..18dbd918 100644 --- a/modules/nf-core/svync/main.nf +++ b/modules/nf-core/svync/main.nf @@ -4,8 +4,8 @@ process SVYNC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/svync:0.1.2--h9ee0642_0': - 'biocontainers/svync:0.1.2--h9ee0642_0' }" + 'https://depot.galaxyproject.org/singularity/svync:0.3.0--h9ee0642_0': + 'biocontainers/svync:0.3.0--h9ee0642_0' }" input: tuple val(meta), path(vcf), path(tbi), path(config) diff --git a/modules/nf-core/svync/meta.yml b/modules/nf-core/svync/meta.yml index 8da1fc49..2357aa15 100644 --- a/modules/nf-core/svync/meta.yml +++ b/modules/nf-core/svync/meta.yml @@ -25,17 +25,21 @@ input: type: file description: The input VCF file containing structural variants pattern: "*.{vcf,vcf.gz}" + ontologies: [] - tbi: type: file description: The index of the input VCF file containing structural variants pattern: "*.tbi" + ontologies: [] - config: type: file description: The config stating how the standardization should happen pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML output: - - vcf: - - meta: + vcf: + - - meta: type: map description: | Groovy Map containing sample information @@ -44,8 +48,10 @@ output: type: file description: The standardized VCF file pattern: "*.vcf.gz" - - tbi: - - meta: + ontologies: + - edam: http://edamontology.org/format_3989 # GZIP format + tbi: + - - meta: type: map description: | Groovy Map containing sample information @@ -54,11 
+60,14 @@ output: type: file description: The index of the standardized VCF file pattern: "*.tbi" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML authors: - "@nvnieuwk" maintainers: diff --git a/modules/nf-core/svync/tests/main.nf.test.snap b/modules/nf-core/svync/tests/main.nf.test.snap index f242ce9a..b4715e8e 100644 --- a/modules/nf-core/svync/tests/main.nf.test.snap +++ b/modules/nf-core/svync/tests/main.nf.test.snap @@ -21,15 +21,15 @@ ] ], "versions": [ - "versions.yml:md5,dd982c7896f22ebaa0ea51d00472c96c" + "versions.yml:md5,37174a343ebe85fcce767948b2d3686e" ] } ], "meta": { - "nf-test": "0.9.1", - "nextflow": "24.10.4" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2025-02-13T15:20:25.459243101" + "timestamp": "2025-07-16T13:46:13.393014929" }, "sarscov2 - vcf, config - stub": { "content": [ @@ -53,14 +53,14 @@ ] ], "versions": [ - "versions.yml:md5,dd982c7896f22ebaa0ea51d00472c96c" + "versions.yml:md5,37174a343ebe85fcce767948b2d3686e" ] } ], "meta": { - "nf-test": "0.9.1", - "nextflow": "24.10.4" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2025-02-13T15:18:08.523826702" + "timestamp": "2025-07-16T13:46:22.727686645" } } \ No newline at end of file diff --git a/modules/nf-core/svync/tests/tags.yml b/modules/nf-core/svync/tests/tags.yml deleted file mode 100644 index e63467dc..00000000 --- a/modules/nf-core/svync/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -svync: - - "modules/nf-core/svync/**" diff --git a/nextflow_schema.json b/nextflow_schema.json index bab849c9..deb02016 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -73,6 +73,22 @@ "pattern": "^\\S+\\.fai$", "mimetype": "text/plain" }, + "dict": { + "type": "string", + 
"description": "The sequence dictionary of the FASTA reference file", + "exists": true, + "format": "file-path", + "pattern": "^\\S+\\.dict$", + "mimetype": "text/plain" + }, + "gtf": { + "type": "string", + "description": "Path to GTF file for the reference genome. Gene and transcript annotations will be added when this file is provided", + "exists": true, + "format": "file-path", + "pattern": "^\\S+\\.gtf$", + "mimetype": "text/plain" + }, "expansionhunter_catalog": { "type": "string", "description": "Path to the expansionhunter catalog", diff --git a/tests/nextflow.config b/tests/nextflow.config index 332cfa01..2751dfd5 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -18,6 +18,8 @@ params { // References for test data fasta = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta" fai = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta.fai" + dict = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.dict" + gtf = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.gtf" bwa = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/bwa.tar.gz" expansionhunter_catalog = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/expansionhunter/variant_catalog.json" qdnaseq_male = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/qdnaseq/chr21.10kbp.rda" diff --git a/tests/pipeline/all_types/all.nf.test.snap b/tests/pipeline/all_types/all.nf.test.snap index e8904933..e7159c5e 100644 --- a/tests/pipeline/all_types/all.nf.test.snap +++ b/tests/pipeline/all_types/all.nf.test.snap @@ -2,7 +2,7 @@ "all": { "content": [ [ - + ], [ "PosCon1/PosCon1.cnv.vcf.gz", @@ -82,7 +82,7 @@ "concat": { "content": [ [ - + ], [ 
"PosCon1/PosCon1.vcf.gz", @@ -141,12 +141,12 @@ "nf-test": "0.9.2", "nextflow": "25.04.6" }, - "timestamp": "2025-07-10T16:20:18.015157817" + "timestamp": "2025-07-22T11:45:55.747550179" }, "output callers": { "content": [ [ - + ], [ "PosCon1/PosCon1.cnv.vcf.gz", @@ -255,4 +255,4 @@ }, "timestamp": "2025-07-10T16:21:41.411733092" } -} \ No newline at end of file +} diff --git a/tests/subworkflows/local/bam_sv_calling/main.nf.test.snap b/tests/subworkflows/local/bam_sv_calling/main.nf.test.snap index 1373c202..eece493f 100644 --- a/tests/subworkflows/local/bam_sv_calling/main.nf.test.snap +++ b/tests/subworkflows/local/bam_sv_calling/main.nf.test.snap @@ -15,7 +15,7 @@ ] ], "reports": [ - + ], "vcfs": [ [ @@ -53,7 +53,7 @@ ] ], "reports": [ - + ], "vcfs": [ [ @@ -63,17 +63,17 @@ "sex": "male", "variant_type": "sv" }, - "test.svync.manta.vcf.gz,variantsMD5:73d9618ba3a50341575d3c9c1d2796ab", + "test.svync.manta.vcf.gz,variantsMD5:a25dede957f135ec2f76266f06cac512", "test.svync.manta.vcf.gz.tbi" ] ] } ], "meta": { - "nf-test": "0.9.1", - "nextflow": "24.10.4" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2025-02-13T17:25:48.450406346" + "timestamp": "2025-07-22T12:13:19.296439771" }, "homo_sapiens - delly": { "content": [ @@ -91,7 +91,7 @@ ] ], "reports": [ - + ], "vcfs": [ [ @@ -101,17 +101,17 @@ "sex": "male", "variant_type": "sv" }, - "test.svync.delly.vcf.gz,variantsMD5:caf4116f675c20783b0119c98e70f2ea", + "test.svync.delly.vcf.gz,variantsMD5:e15d4c5f477f8e86b4d077da42b9de83", "test.svync.delly.vcf.gz.tbi" ] ] } ], "meta": { - "nf-test": "0.9.1", - "nextflow": "24.10.4" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2025-02-13T17:26:06.285138049" + "timestamp": "2025-07-22T12:14:02.716091083" }, "homo_sapiens - all sv callers": { "content": [ @@ -149,7 +149,7 @@ ] ], "reports": [ - + ], "vcfs": [ [ @@ -159,16 +159,16 @@ "sex": "male", "variant_type": "sv" }, - "test.sv.vcf.gz,variantsMD5:2d626bce1f675bc7c41ec59d9a09a7aa", + 
"test.sv.vcf.gz,variantsMD5:fb4b0ff0cd5dcb247c0fe1fbcb246cc5", "test.sv.vcf.gz.tbi" ] ] } ], "meta": { - "nf-test": "0.9.1", - "nextflow": "24.10.4" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2025-02-13T17:26:51.327526373" + "timestamp": "2025-07-22T12:15:39.713604515" } -} \ No newline at end of file +} diff --git a/tests/subworkflows/local/bam_variant_calling_delly/main.nf.test.snap b/tests/subworkflows/local/bam_variant_calling_delly/main.nf.test.snap index 70dcc22a..140a46e3 100644 --- a/tests/subworkflows/local/bam_variant_calling_delly/main.nf.test.snap +++ b/tests/subworkflows/local/bam_variant_calling_delly/main.nf.test.snap @@ -10,7 +10,7 @@ "sex": "male", "caller": "delly" }, - "test.svync.delly.vcf.gz,variantsMD5:caf4116f675c20783b0119c98e70f2ea", + "test.svync.delly.vcf.gz,variantsMD5:e15d4c5f477f8e86b4d077da42b9de83", "test.svync.delly.vcf.gz.tbi" ] ], @@ -29,9 +29,9 @@ } ], "meta": { - "nf-test": "0.9.1", - "nextflow": "24.10.4" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2025-02-13T16:40:54.037387408" + "timestamp": "2025-07-22T12:16:17.594443126" } -} \ No newline at end of file +} diff --git a/tests/subworkflows/local/bam_variant_calling_manta/main.nf.test.snap b/tests/subworkflows/local/bam_variant_calling_manta/main.nf.test.snap index 29e4a231..879236d5 100644 --- a/tests/subworkflows/local/bam_variant_calling_manta/main.nf.test.snap +++ b/tests/subworkflows/local/bam_variant_calling_manta/main.nf.test.snap @@ -5,16 +5,16 @@ "##contig=", "##contig=", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tPosCon1", - "chr16\t1061455\tmanta_DEL_1\tGTTCCACTCATGTTGTTGCAGATTACTGGATCTCACTCTTTTTTATGGGTGAATCGTACTTCATAGTGTATATGTACC\tG\t619\tPASS\tCALLERS=manta;CIPOS=0,5;END=1061532;SVLEN=-77;SVTYPE=DEL\tGT:PE:SR\t1/1:0,0:0,19" + 
"chr16\t1061455\tmanta_DEL_1\tGTTCCACTCATGTTGTTGCAGATTACTGGATCTCACTCTTTTTTATGGGTGAATCGTACTTCATAGTGTATATGTACC\t\t619\tPASS\tCALLERS=manta;CIPOS=0,5;END=1061532;SVLEN=-77;SVTYPE=DEL\tGT:PE:SR\t1/1:0,0:0,19" ], "test.svync.manta.vcf.gz.tbi", 6, 0 ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.02.0" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2024-03-11T14:12:51.499950856" + "timestamp": "2025-07-22T12:16:57.486074419" } -} \ No newline at end of file +} diff --git a/workflows/structural.nf b/workflows/structural.nf index c899bd54..66527af1 100644 --- a/workflows/structural.nf +++ b/workflows/structural.nf @@ -37,6 +37,8 @@ include { VCF_MERGE_FAMILY_JASMINE } from '../subworkflows/local/vc // MODULE: Installed directly from nf-core/modules // include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' +include { GATK4_CREATESEQUENCEDICTIONARY } from '../modules/nf-core/gatk4/createsequencedictionary/main' +include { PREPROCESS_GTF } from '../modules/local/preprocess_gtf/main' include { BWA_INDEX } from '../modules/nf-core/bwa/index/main' include { ENSEMBLVEP_DOWNLOAD } from '../modules/nf-core/ensemblvep/download/main' include { ANNOTSV_INSTALLANNOTATIONS } from '../modules/nf-core/annotsv/installannotations/main' @@ -45,6 +47,7 @@ include { UNTAR as UNTAR_BWA } from '../modules/nf-core/untar/mai include { NGSBITS_SAMPLEGENDER } from '../modules/nf-core/ngsbits/samplegender/main' include { BCFTOOLS_FILTER } from '../modules/nf-core/bcftools/filter/main' include { SVTOOLS_VCFTOBEDPE } from '../modules/nf-core/svtools/vcftobedpe/main' +include { GATK4_SVANNOTATE } from '../modules/nf-core/gatk4/svannotate/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' /* @@ -66,6 +69,8 @@ workflow STRUCTURAL { // file inputs fasta // The fasta file to use fai // The index of the fasta file + dict // The dictionary of the fasta file + gtf // The GTF file to use for annotation expansionhunter_catalog // The expansionhunter catalog 
qdnaseq_female // The QDNAseq annotations for female samples qdnaseq_male // The QDNAseq annotations for male samples @@ -200,6 +205,32 @@ workflow STRUCTURAL { ch_fai = Channel.fromPath(fai).collect { fai_file -> [[id:'fai'], fai_file] } } + def ch_dict = Channel.empty() + // Dictionary is only needed for GATK4_SVANNOTATE + if(!dict && gtf) { + GATK4_CREATESEQUENCEDICTIONARY( + ch_fasta + ) + ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) + + ch_dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict.collect() + } + else if(dict) { + ch_dict = Channel.fromPath(dict).collect { dict_file -> [[id:'dict'], dict_file] } + } + + def ch_preprocessed_gtf = Channel.empty() + // Sanitize GTF file to adhere to the extremely strict GTF parsing in SVAnnotate + if (gtf) { + ch_sanitize_input = Channel.fromPath(gtf).collect { gtf_file -> [[id:'gtf'], gtf_file] } + PREPROCESS_GTF( + ch_sanitize_input + ) + ch_versions = ch_versions.mix(PREPROCESS_GTF.out.versions) + + ch_preprocessed_gtf = PREPROCESS_GTF.out.gtf.collect() + } + // if(!bwa && "gridss" in callers){ // BWA_INDEX( // ch_fasta @@ -427,16 +458,35 @@ workflow STRUCTURAL { ch_vcfanno_output = ch_annotation_input } - def ch_outputs = Channel.empty() + def ch_filter_outputs = Channel.empty() if(filter) { BCFTOOLS_FILTER( ch_vcfanno_output ) - def ch_filter_output = BCFTOOLS_FILTER.out.vcf.join(BCFTOOLS_FILTER.out.tbi, failOnMismatch:true, failOnDuplicate:true) - ch_outputs = ch_outputs.mix(ch_filter_output) + ch_filter_outputs = BCFTOOLS_FILTER.out.vcf.join(BCFTOOLS_FILTER.out.tbi, failOnMismatch:true, failOnDuplicate:true) ch_versions = ch_versions.mix(BCFTOOLS_FILTER.out.versions) } else { - ch_outputs = ch_outputs.mix(ch_vcfanno_output) + ch_filter_outputs = ch_vcfanno_output + } + + def ch_outputs = Channel.empty() + if(gtf) { + def ch_svannotate_input = ch_filter_outputs + .map { meta, vcf, tbi -> + [ meta, vcf, tbi, [], [] ] // TODO add BED files + } + + GATK4_SVANNOTATE( + ch_svannotate_input, + 
ch_fasta, + ch_fai, + ch_dict, + ch_preprocessed_gtf + ) + ch_versions = ch_versions.mix(GATK4_SVANNOTATE.out.versions.first()) + ch_outputs = ch_outputs.mix(GATK4_SVANNOTATE.out.vcf.join(GATK4_SVANNOTATE.out.tbi, failOnMismatch:true, failOnDuplicate:true)) + } else { + ch_outputs = ch_outputs.mix(ch_filter_outputs) } //