Skip to content

Commit c1e8221

Browse files
authored
Add GATK4_SVANNOTATE (#117)
* fix annotsv issues * add svannotate * update svync for compatilbility * fix test file locations * fix svannotate output name * update changelog * fix tests
1 parent 2ac8cbf commit c1e8221

32 files changed

Lines changed: 943 additions & 60 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
1. Added the samplesheet to the pipeline output as `OUTDIR/samplesheet.csv`
1111
2. Added the `--bedpe` parameter. This makes the pipeline output BEDPE files alongside the VCF files.
1212
3. Added parallelization on SV type to the delly flow
13+
4. Added a `--gtf` parameter for annotation of gene and transcript overlap using `gatk SVAnnotate`.
1314

1415
### `Changes`
1516

assets/svync/delly.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
id: delly_$INFO/SVTYPE
22
alt:
3-
BND: TRA
3+
alts:
4+
BND: <TRA>
5+
value: <$INFO/SVTYPE>
46
info:
57
CALLERS:
68
value: delly

assets/svync/manta.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
id: manta_$INFO/SVTYPE
2+
alt:
3+
value: <$INFO/SVTYPE>
24
info:
35
CALLERS:
46
value: manta

bin/preprocess_gtf.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/env python
2+
# https://github.com/broadinstitute/gatk-sv/blob/main/scripts/inputs/preprocess_gtf.py
3+
4+
"""
5+
Preprocess GENCODE basic GTF to extract canonical protein-coding transcripts for functional consequence annotation.
6+
"""
7+
8+
import argparse
9+
import gzip
10+
11+
12+
CHROM_FIELD = 0
13+
ELEMENT_FIELD = 2
14+
ATTRIBUTES_FIELD = 8
15+
TRANSCRIPT_TYPES = {"protein_coding", "nonsense_mediated_decay"}
16+
CANONICAL = {"MANE_Plus_Clinical", "MANE_Select", "Ensembl_canonical"}
17+
18+
19+
# Flexibly open .gz or uncompressed file to read
20+
def _open(filename):
21+
if filename.endswith(".gz"):
22+
return gzip.open(filename, 'rt')
23+
else:
24+
return open(filename, 'r')
25+
26+
27+
# Extract transcript type and canonical status
28+
def parse_attributes(field):
29+
# format: key1 "value1"; key2 "value2";
30+
# keys may be repeated so cannot convert directly to dictionary
31+
attributes_list = [tuple(x.replace('"', '').split(' ')) for x in field.rstrip(";").split("; ")]
32+
protein = False
33+
canonical = False
34+
for key, val in attributes_list:
35+
if key == "tag" and val in CANONICAL:
36+
canonical = True
37+
elif key == "transcript_type" and val in TRANSCRIPT_TYPES:
38+
protein = True
39+
return protein, canonical
40+
41+
42+
def process(gtf, outfile):
43+
with _open(gtf) as inp, open(outfile, 'w') as out:
44+
gene_line = ""
45+
for line in inp:
46+
if line.startswith("#"):
47+
continue
48+
fields = line.rstrip('\n').split('\t')
49+
50+
# Drop mitochondria
51+
if fields[CHROM_FIELD] == 'chrM':
52+
continue
53+
54+
# Store gene line to print if transcript is eligible
55+
if fields[ELEMENT_FIELD] == "gene":
56+
gene_line = line
57+
continue
58+
59+
# Select protein-coding and canonical transcripts only
60+
protein, canonical = parse_attributes(fields[ATTRIBUTES_FIELD])
61+
if protein and canonical:
62+
out.write(gene_line + line)
63+
gene_line = "" # only print gene line before first transcript line
64+
65+
66+
def main():
67+
parser = argparse.ArgumentParser()
68+
parser.add_argument('gtf', help="Input GTF from GENCODE")
69+
parser.add_argument('outfile', help="Output filename")
70+
args = parser.parse_args()
71+
72+
process(args.gtf, args.outfile)
73+
74+
75+
if __name__ == '__main__':
76+
main()

conf/modules.config

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,10 @@ process {
259259
ext.args = "-ends"
260260
}
261261

262+
withName: "^.*GATK4_SVANNOTATE\$" {
263+
ext.prefix = {"${meta.id}.${meta.variant_type}.svannotate"}
264+
}
265+
262266
/*
263267
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
264268
SV AND CNV FILTERING

conf/test.config

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ params {
3333
// Fasta references
3434
fasta = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta"
3535
fai = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.fasta.fai"
36+
dict = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.dict"
37+
gtf = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/reference.gtf"
3638
// bwa = "https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/genome/seq/SVcontrol/bwa.tar.gz"
3739
expansionhunter_catalog = params.test_data["homo_sapiens"]["genome"]["expansionhunter"]
3840
qdnaseq_male = params.test_data["homo_sapiens"]["genome"]["genome_qdnaseq"]

main.nf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_stru
2828

2929
params.fasta = getGenomeAttribute('fasta')
3030
params.fai = getGenomeAttribute('fai')
31+
params.dict = getGenomeAttribute('dict')
32+
params.gtf = getGenomeAttribute('gtf')
3133
params.vep_cache = getGenomeAttribute('vep_cache')
3234
// params.bwa = getGenomeAttribute('bwa')
3335
params.annotsv_annotations = getGenomeAttribute('annotsv_annotations')
@@ -81,6 +83,8 @@ workflow {
8183
// files
8284
params.fasta,
8385
params.fai,
86+
params.dict,
87+
params.gtf,
8488
params.expansionhunter_catalog ?: "https://github.com/Illumina/ExpansionHunter/raw/master/variant_catalog/grch38/variant_catalog.json",
8589
params.qdnaseq_female,
8690
params.qdnaseq_male,

modules.json

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,16 @@
6666
"git_sha": "b42fec6f7c6e5d0716685cabb825ef6bf6e386b5",
6767
"installed_by": ["modules"]
6868
},
69+
"gatk4/createsequencedictionary": {
70+
"branch": "master",
71+
"git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",
72+
"installed_by": ["modules"]
73+
},
74+
"gatk4/svannotate": {
75+
"branch": "master",
76+
"git_sha": "cc7e281e7877146dac79c5a484e6e2b10086234a",
77+
"installed_by": ["modules"]
78+
},
6979
"gawk": {
7080
"branch": "master",
7181
"git_sha": "b42fec6f7c6e5d0716685cabb825ef6bf6e386b5",
@@ -135,7 +145,7 @@
135145
},
136146
"svync": {
137147
"branch": "master",
138-
"git_sha": "916a4cbc4f831d501860495b157c4857833e22a7",
148+
"git_sha": "0fc190096fa8dcc9878cef178479f22e03f174a1",
139149
"installed_by": ["modules"]
140150
},
141151
"tabix/bgziptabix": {
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
dependencies:
5+
- conda-forge::python=3.13.5
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
process PREPROCESS_GTF {
2+
tag "$meta.id"
3+
label 'process_single'
4+
5+
conda "${moduleDir}/environment.yml"
6+
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
7+
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8a/8ad257d53c2a2b8810d2b12d4d8e3ea438bc8c4a6be7c39b0354cd7bb8d5c260/data':
8+
'community.wave.seqera.io/library/python:3.13.5--18032a8dc5d4b91e' }"
9+
10+
input:
11+
tuple val(meta), path(gtf)
12+
13+
output:
14+
tuple val(meta), path("*.sanitized.gtf"), emit: gtf
15+
path "versions.yml" , emit: versions
16+
17+
script:
18+
def prefix = task.ext.prefix ?: "${gtf.baseName}"
19+
20+
"""
21+
preprocess_gtf.py $gtf ${prefix}.sanitized.gtf
22+
23+
cat <<-END_VERSIONS > versions.yml
24+
"${task.process}":
25+
grep: \$(echo \$(grep --version) | sed -e 's/grep (GNU grep) //;s/ Copyright.*//')
26+
END_VERSIONS
27+
"""
28+
29+
stub:
30+
def prefix = task.ext.prefix ?: "${gtf.baseName}"
31+
32+
"""
33+
touch ${prefix}.sanitized.gtf
34+
35+
cat <<-END_VERSIONS > versions.yml
36+
"${task.process}":
37+
grep: \$(echo \$(grep --version) | sed -e 's/grep (GNU grep) //;s/ Copyright.*//')
38+
END_VERSIONS
39+
"""
40+
}

0 commit comments

Comments
 (0)