nf-cmgg
diff --git a/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎assets/svync/delly.yaml‎
Lines changed: 3 additions & 1 deletion b/‎assets/svync/delly.yaml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎assets/svync/manta.yaml‎
Lines changed: 2 additions & 0 deletions b/‎assets/svync/manta.yaml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎bin/preprocess_gtf.py‎
Lines changed: 76 additions & 0 deletions b/‎bin/preprocess_gtf.py‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎conf/modules.config‎
Lines changed: 4 additions & 0 deletions b/‎conf/modules.config‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎conf/test.config‎
Lines changed: 2 additions & 0 deletions b/‎conf/test.config‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎main.nf‎
Lines changed: 4 additions & 0 deletions b/‎main.nf‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎modules.json‎
Lines changed: 11 additions & 1 deletion b/‎modules.json‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎modules/local/preprocess_gtf/environment.yml‎
Lines changed: 5 additions & 0 deletions b/‎modules/local/preprocess_gtf/environment.yml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎modules/local/preprocess_gtf/main.nf‎
Lines changed: 40 additions & 0 deletions b/‎modules/local/preprocess_gtf/main.nf‎
Lines changed: 40 additions & 0 deletions
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 1. Added the samplesheet to the pipeline output as `OUTDIR/samplesheet.csv`
 2. Added the `--bedpe` parameter. This makes the pipeline output BEDPE files alongside the VCF files.
 3. Added parallelization on SV type to the delly flow
+4. Added a `--gtf` parameter for annotation of gene and transcript overlap using `gatk SVAnnotate`.
 
 ### `Changes`
 
 
@@ -1,6 +1,8 @@
 id: delly_$INFO/SVTYPE
 alt:
-  BND: TRA
+  alts:
+    BND: <TRA>
+  value: <$INFO/SVTYPE>
 info:
   CALLERS:
     value: delly
 
@@ -1,4 +1,6 @@
 id: manta_$INFO/SVTYPE
+alt:
+  value: <$INFO/SVTYPE>
 info:
   CALLERS:
     value: manta
 
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# https://github.com/broadinstitute/gatk-sv/blob/main/scripts/inputs/preprocess_gtf.py
+
+"""
+Preprocess GENCODE basic GTF to extract canonical protein-coding transcripts for functional consequence annotation.
+"""
+
+import argparse
+import gzip
+
+
+# Zero-based column indices into a tab-split GTF record
+CHROM_FIELD = 0
+ELEMENT_FIELD = 2
+ATTRIBUTES_FIELD = 8
+# `transcript_type` attribute values accepted by parse_attributes()
+TRANSCRIPT_TYPES = {"protein_coding", "nonsense_mediated_decay"}
+# `tag` attribute values that parse_attributes() treats as canonical
+CANONICAL = {"MANE_Plus_Clinical", "MANE_Select", "Ensembl_canonical"}
+
+
+# Flexibly open .gz or uncompressed file to read
+def _open(filename):
+    """Return a text-mode read handle for *filename*.
+
+    Names ending in ".gz" are opened through gzip; any other name is opened
+    as a plain file.
+    """
+    if not filename.endswith(".gz"):
+        return open(filename, 'r')
+    return gzip.open(filename, 'rt')
+
+
+# Extract transcript type and canonical status
+def parse_attributes(field):
+    """Report whether a GTF attributes column marks an eligible transcript.
+
+    The column has the form `key1 "value1"; key2 "value2";`. Keys may be
+    repeated, so the pairs are scanned in order instead of being converted to
+    a dictionary.
+
+    Returns a `(protein, canonical)` tuple of booleans: `protein` is True when
+    a `transcript_type` value is in TRANSCRIPT_TYPES, `canonical` is True when
+    a `tag` value is in CANONICAL.
+    """
+    protein = False
+    canonical = False
+    for attribute in field.strip().rstrip(";").split("; "):
+        # Split on the first whitespace run only: a value that itself contains
+        # spaces must stay in one piece instead of breaking the key/value pair.
+        parts = attribute.replace('"', '').strip().split(None, 1)
+        if len(parts) != 2:
+            # Empty or valueless entry (e.g. an empty column): nothing to test
+            continue
+        key, val = parts
+        if key == "tag" and val in CANONICAL:
+            canonical = True
+        elif key == "transcript_type" and val in TRANSCRIPT_TYPES:
+            protein = True
+    return protein, canonical
+
+
+def process(gtf, outfile):
+    """Copy the eligible transcript records of *gtf* into *outfile*.
+
+    Comment lines, blank lines and records whose first column is `chrM` are
+    dropped. A `gene` record is held back and written only in front of the
+    first following record that parse_attributes() reports as both protein and
+    canonical; each such record is written unchanged.
+
+    *gtf* may be plain text or `.gz`; *outfile* is always written as plain
+    text and is overwritten if it exists.
+    """
+    with _open(gtf) as inp, open(outfile, 'w') as out:
+        gene_line = ""
+        for line in inp:
+            # Blank lines (e.g. a trailing one) have no columns to index into
+            if line.startswith("#") or not line.strip():
+                continue
+            fields = line.rstrip('\n').split('\t')
+
+            # Drop mitochondria
+            if fields[CHROM_FIELD] == 'chrM':
+                continue
+
+            # Store gene line to print if transcript is eligible
+            if fields[ELEMENT_FIELD] == "gene":
+                gene_line = line
+                continue
+
+            # Select protein-coding and canonical transcripts only
+            protein, canonical = parse_attributes(fields[ATTRIBUTES_FIELD])
+            if protein and canonical:
+                out.write(gene_line + line)
+                gene_line = ""  # only print gene line before first transcript line
+
+
+def main():
+    """Read the two positional command-line arguments and run process()."""
+    cli = argparse.ArgumentParser()
+    positionals = (
+        ('gtf', "Input GTF from GENCODE"),
+        ('outfile', "Output filename"),
+    )
+    for name, description in positionals:
+        cli.add_argument(name, help=description)
+    options = cli.parse_args()
+
+    process(options.gtf, options.outfile)
+
+
+if __name__ == '__main__':
+    main()
@@ -259,6 +259,10 @@ process {
         ext.args = "-ends"
     }
 
+    withName: "^.*GATK4_SVANNOTATE\$" {
+        ext.prefix = {"${meta.id}.${meta.variant_type}.svannotate"}
+    }
+
     /*
     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         SV AND CNV FILTERING
 
@@ -33,6 +33,8 @@ params {
     // Fasta references
     fasta                   = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta"
     fai                     = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta.fai"
+    dict                    = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.dict"
+    gtf                     = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.gtf"
     // bwa                     = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/bwa.tar.gz"
     expansionhunter_catalog = params.test_data["homo_sapiens"]["genome"]["expansionhunter"]
     qdnaseq_male            = params.test_data["homo_sapiens"]["genome"]["genome_qdnaseq"]
 
@@ -28,6 +28,8 @@ include { getGenomeAttribute      } from './subworkflows/local/utils_nfcore_stru
 
 params.fasta                    = getGenomeAttribute('fasta')
 params.fai                      = getGenomeAttribute('fai')
+params.dict                     = getGenomeAttribute('dict')
+params.gtf                      = getGenomeAttribute('gtf')
 params.vep_cache                = getGenomeAttribute('vep_cache')
 // params.bwa                      = getGenomeAttribute('bwa')
 params.annotsv_annotations      = getGenomeAttribute('annotsv_annotations')
@@ -81,6 +83,8 @@ workflow {
         // files
         params.fasta,
         params.fai,
+        params.dict,
+        params.gtf,
         params.expansionhunter_catalog ?: "https://github.com/Illumina/ExpansionHunter/raw/master/variant_catalog/grch38/variant_catalog.json",
         params.qdnaseq_female,
         params.qdnaseq_male,
 
@@ -66,6 +66,16 @@
                         "git_sha": "b42fec6f7c6e5d0716685cabb825ef6bf6e386b5",
                         "installed_by": ["modules"]
                     },
+                    "gatk4/createsequencedictionary": {
+                        "branch": "master",
+                        "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",
+                        "installed_by": ["modules"]
+                    },
+                    "gatk4/svannotate": {
+                        "branch": "master",
+                        "git_sha": "cc7e281e7877146dac79c5a484e6e2b10086234a",
+                        "installed_by": ["modules"]
+                    },
                     "gawk": {
                         "branch": "master",
                         "git_sha": "b42fec6f7c6e5d0716685cabb825ef6bf6e386b5",
@@ -135,7 +145,7 @@
                     },
                     "svync": {
                         "branch": "master",
-                        "git_sha": "916a4cbc4f831d501860495b157c4857833e22a7",
+                        "git_sha": "0fc190096fa8dcc9878cef178479f22e03f174a1",
                         "installed_by": ["modules"]
                     },
                     "tabix/bgziptabix": {
 
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.13.5
@@ -0,0 +1,40 @@
+// Run preprocess_gtf.py on a GTF and emit the result as <prefix>.sanitized.gtf
+process PREPROCESS_GTF {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8a/8ad257d53c2a2b8810d2b12d4d8e3ea438bc8c4a6be7c39b0354cd7bb8d5c260/data':
+        'community.wave.seqera.io/library/python:3.13.5--18032a8dc5d4b91e' }"
+
+    input:
+    tuple val(meta), path(gtf)
+
+    output:
+    tuple val(meta), path("*.sanitized.gtf"), emit: gtf
+    path "versions.yml"                     , emit: versions
+
+    script:
+    // Output is named after the input's base name unless ext.prefix overrides it
+    def prefix  = task.ext.prefix ?: "${gtf.baseName}"
+
+    // The tool executed here is a Python script, so the version recorded is
+    // that of the Python interpreter.
+    """
+    preprocess_gtf.py $gtf ${prefix}.sanitized.gtf
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${gtf.baseName}"
+
+    // Create an empty placeholder output; versions.yml mirrors the script block
+    """
+    touch ${prefix}.sanitized.gtf
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //')
+    END_VERSIONS
+    """
+}